from fastcore.test import test_eq

Content Parser
metadata Parsing
Utilities for extracting frontmatter metadata from markdown and Jupyter notebook files.
parse_metadata
def parse_metadata(
content:str, # Raw markdown content with YAML frontmatter
)->dict:
Extract metadata from YAML frontmatter.
parse_notebook_metadata
def parse_notebook_metadata(
content:str, # Raw Jupyter notebook JSON string
)->dict:
Extract metadata from the first cell of a Jupyter notebook.
# Test parse_metadata
from pathlib import Path

# Resolve the sample directory whether we run from the repo root or one level in.
sample_dir = Path("sample")
if not sample_dir.exists():
    sample_dir = Path("../sample")

# Markdown sample with YAML frontmatter.
content = (sample_dir / "example.md").read_text()
metadata = parse_metadata(content)

# content is .ipynb
nb_content = (sample_dir / "design_questions.ipynb").read_text()
nb_metadata = parse_notebook_metadata(nb_content)
print(nb_metadata)
{}
Notebook Parsing
Helpers for filtering and extracting content from Jupyter notebook cells.
is_frontmatter
def is_frontmatter(
cell:dict, # Notebook cell dict
)->bool:
Check if a cell is a YAML frontmatter cell.
is_visible_code
def is_visible_code(
cell:dict, # Notebook cell dict
is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->bool:
Check if a code cell should be included in output.
extract_notebook_content
def extract_notebook_content(
content:str, # Raw Jupyter notebook JSON string
is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->str:
Extract visible markdown and code content from a Jupyter notebook.
remove_metadata
def remove_metadata(
content:str
)->str:
Remove frontmatter from content.
Content Checks
Functions for validating SEO-relevant content lengths for titles, descriptions, and body text.
check_content_length
def check_content_length(
content:str, # Page body content
)->dict:
Check if page content has sufficient word count for SEO.
check_desc_length
def check_desc_length(
description:str, # Meta description to check
min_len:int=150, # Minimum optimal length
max_len:int=160, # Maximum optimal length
)->dict:
Check if a meta description length is optimal for SEO.
check_title_length
def check_title_length(
title:str, # Page title to check
min_len:int=50, # Minimum optimal length
max_len:int=60, # Maximum optimal length
)->dict:
Check if a page title length is optimal for SEO.
check_length
def check_length(
text:str, # Text to check
min_len:int, # Minimum optimal length
max_len:int, # Maximum optimal length
)->dict:
Check if text length falls within the optimal range.
Content Extraction
Functions for extracting headers, links, and images from markdown and HTML content.
extract_headers
def extract_headers(
file_path:str, # Path to markdown file
)->list:
Extract all headers with their level, line number, content, and length.
# Sanity-check header extraction against the sample markdown file.
headers = extract_headers(str(sample_dir / "example.md"))
# example.md is expected to contain exactly two H1-level headers.
test_eq(len([h for h in headers if h["type"] == "h1"]), 2)
test_eq(headers[0]["content"], "This is me Kareem")

extract_links
def extract_links(
content:str
)->dict:
Extract all links with metadata.
# Link extraction: the sample content should include this external URL.
links = extract_links(content)
test_eq("https://emdadelgaz.com" in links, True)
test_eq("https://awazly.com/" in links, True)

extract_images
def extract_images(
content:str, # Markdown or HTML content
)->list:
Extract images with alt text from markdown and HTML.
imgs_missing_alts
def imgs_missing_alts(
images:list, # List of image dicts from [`extract_images`](https://abdelkareemkobo.github.io/seo_rat/content_parser.html#extract_images)
)->list:
Return URLs of images missing alt text.
# Image extraction: the sample content contains exactly one image.
images = extract_images(content)
test_eq(len(images), 1)
test_eq(images[0]["alt_text"], "Iron man photo")

Link Filtering
Utilities for classifying and filtering internal and external links.
filter_internal_links
def filter_internal_links(
urls:list, # List of URLs to filter
domain:str, # Domain to match against (e.g. 'example.com')
)->list:
Filter for internal links, excluding images and special URLs.
is_special_url
def is_special_url(
url:str, # URL to check
)->bool:
Check if URL is an anchor, mailto, tel, or javascript link.
filter_external_links
def filter_external_links(
urls:list, # List of URLs to filter
domain:str, # Site domain to exclude
)->list:
Filter for external links only, excluding images and special URLs.
Text Utilities
General-purpose text normalization and analysis helpers.
normalize_text
def normalize_text(
text:str, # Text to normalize
)->str:
Normalize text by collapsing extra whitespace.
detect_phone_numbers
def detect_phone_numbers(
text:str, # Text to search
)->list:
Extract phone numbers from text.
# Phone detection: pull phone numbers out of the sample markdown content.
phones = detect_phone_numbers(content)
test_eq("+966503139675" in phones, True)

calculate_similarity
def calculate_similarity(
text1:str, # First text
text2:str, # Second text
)->float:
Calculate similarity ratio between two texts using SequenceMatcher.
File Utilities
Helpers for finding and reading markdown and notebook files.
get_file_paths
def get_file_paths(
pattern:str, # Glob pattern (e.g. '**/*.md')
)->list:
Get file paths matching a glob pattern.
get_file_name
def get_file_name(
file_path:str, # Path to file
)->str:
Extract filename without extension from a path.
get_page_content
def get_page_content(
file_path:str, # Path to markdown or notebook file
is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->str:
Read a file and return its text content, stripping frontmatter.
get_markdown_files
def get_markdown_files(
directory:str, # Directory to search
)->list:
Get all markdown filenames (without extension) from a directory.
Arabic Slug Utilities
Helpers for converting Arabic filenames to URL-friendly slugs.
arabic_to_slug
def arabic_to_slug(
text:str, # Arabic text to convert
)->str:
Convert Arabic text to a URL-friendly slug.
map_files_to_slugs
def map_files_to_slugs(
directory:str, # Directory containing Arabic markdown files
)->dict:
Map markdown filenames to their URL slugs.