Content Parser

Utilities for parsing markdown/notebook metadata, extracting content, and validating SEO elements.
from fastcore.test import test_eq

Metadata Parsing

Utilities for extracting frontmatter metadata from markdown and Jupyter notebook files.


source

parse_metadata


def parse_metadata(
    content:str, # Raw markdown content with YAML frontmatter
)->dict:

Extract metadata from YAML frontmatter.


source

parse_notebook_metadata


def parse_notebook_metadata(
    content:str, # Raw Jupyter notebook JSON string
)->dict:

Extract metadata from the first cell of a Jupyter notebook.

# Test parse_metadata on the sample markdown file.
from pathlib import Path

# Locate the sample directory whether we run from the repo root or from nbs/.
sample_dir = Path("sample")
if not sample_dir.exists():
    sample_dir = Path("../sample")

content = (sample_dir / "example.md").read_text()
metadata = parse_metadata(content)

# Same check for a notebook (.ipynb) source; this sample prints {}
# below, i.e. no frontmatter metadata was found in its first cell.
nb_content = (sample_dir / "design_questions.ipynb").read_text()
nb_metadata = parse_notebook_metadata(nb_content)
print(nb_metadata)
{}

Notebook Parsing

Helpers for filtering and extracting content from Jupyter notebook cells.


source

is_frontmatter


def is_frontmatter(
    cell:dict, # Notebook cell dict
)->bool:

Check if a cell is a YAML frontmatter cell.


source

is_visible_code


def is_visible_code(
    cell:dict, # Notebook cell dict
    is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->bool:

Check if a code cell should be included in output.


source

extract_notebook_content


def extract_notebook_content(
    content:str, # Raw Jupyter notebook JSON string
    is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->str:

Extract visible markdown and code content from a Jupyter notebook.


source

remove_metadata


def remove_metadata(
    content:str
)->str:

Remove frontmatter from content.

Content Checks

Functions for validating SEO-relevant content lengths for titles, descriptions, and body text.


source

check_content_length


def check_content_length(
    content:str, # Page body content
)->dict:

Check if page content has sufficient word count for SEO.


source

check_desc_length


def check_desc_length(
    description:str, # Meta description to check
    min_len:int=150, # Minimum optimal length
    max_len:int=160, # Maximum optimal length
)->dict:

Check if a meta description length is optimal for SEO.


source

check_title_length


def check_title_length(
    title:str, # Page title to check
    min_len:int=50, # Minimum optimal length
    max_len:int=60, # Maximum optimal length
)->dict:

Check if a page title length is optimal for SEO.


source

check_length


def check_length(
    text:str, # Text to check
    min_len:int, # Minimum optimal length
    max_len:int, # Maximum optimal length
)->dict:

Check if text length falls within the optimal range.

Content Extraction

Functions for extracting headers, links, and images from markdown and HTML content.


source

extract_headers


def extract_headers(
    file_path:str, # Path to markdown file
)->list:

Extract all headers with their level, line number, content, and length.

# Sanity-check header extraction against the sample markdown file.
headers = extract_headers(str(sample_dir / "example.md"))
h1_headers = [h for h in headers if h["type"] == "h1"]
test_eq(len(h1_headers), 2)
test_eq(headers[0]["content"], "This is me Kareem")

source

extract_images


def extract_images(
    content:str, # Markdown or HTML content
)->list:

Extract images with alt text from markdown and HTML.


source

imgs_missing_alts


def imgs_missing_alts(
    images:list, # List of image dicts from [`extract_images`](https://abdelkareemkobo.github.io/seo_rat/content_parser.html#extract_images)
)->list:

Return URLs of images missing alt text.

# The sample markdown contains exactly one image, and its alt text is set.
images = extract_images(content)
test_eq(len(images), 1)
test_eq(images[0]["alt_text"], "Iron man photo")

Text Utilities

General-purpose text normalization and analysis helpers.


source

normalize_text


def normalize_text(
    text:str, # Text to normalize
)->str:

Normalize text by collapsing extra whitespace.


source

detect_phone_numbers


def detect_phone_numbers(
    text:str, # Text to search
)->list:

Extract phone numbers from text.

# The sample content embeds this phone number; confirm it is detected.
phones = detect_phone_numbers(content)
test_eq("+966503139675" in phones, True)

source

calculate_similarity


def calculate_similarity(
    text1:str, # First text
    text2:str, # Second text
)->float:

Calculate similarity ratio between two texts using SequenceMatcher.

File Utilities

Helpers for finding and reading markdown and notebook files.


source

get_file_paths


def get_file_paths(
    pattern:str, # Glob pattern (e.g. '**/*.md')
)->list:

Get file paths matching a glob pattern.


source

get_file_name


def get_file_name(
    file_path:str, # Path to file
)->str:

Extract filename without extension from a path.


source

get_page_content


def get_page_content(
    file_path:str, # Path to markdown or notebook file
    is_quarto:bool=False, # Whether the notebook is a Quarto doc
)->str:

Read a file and return its text content, stripping frontmatter.


source

get_markdown_files


def get_markdown_files(
    directory:str, # Directory to search
)->list:

Get all markdown filenames (without extension) from a directory.

Arabic Slug Utilities

Helpers for converting Arabic filenames to URL-friendly slugs.


source

arabic_to_slug


def arabic_to_slug(
    text:str, # Arabic text to convert
)->str:

Convert Arabic text to a URL-friendly slug.


source

map_files_to_slugs


def map_files_to_slugs(
    directory:str, # Directory containing Arabic markdown files
)->dict:

Map markdown filenames to their URL slugs.