# c_prog/surnames.py

#!/usr/bin/env python3
"""
DBLP Surname Extractor
Replicates C program logic for surname frequency analysis
Usage: gunzip -c dblp.xml.gz | python surnames.py
       python surnames.py dblp.xml
"""

import re
import sys
import xml.etree.ElementTree as ET
from collections import Counter


def extract_surname(name_text):
    """Return the surname portion of an author/editor name, or None.

    The name is split on whitespace and the final token is taken as the
    surname.  If there is more than one token and the final one consists of
    exactly four digits, that token is discarded first.  Anything that looks
    like an XML entity (``&...;``) is then stripped from the chosen token.

    Args:
        name_text: Raw text content of an author/editor element (may be
            None or empty).

    Returns:
        The cleaned surname string, or None when the input is empty,
        contains only whitespace, or the surname is empty after cleanup.
    """
    tokens = name_text.split() if name_text else []
    if not tokens:
        return None

    # A trailing four-digit token is dropped, but never when it is the only
    # token: a name must keep at least one word.
    if len(tokens) > 1 and re.match(r"^\d{4}$", tokens[-1]):
        tokens.pop()

    # Entities are removed outright rather than decoded.
    cleaned = re.sub(r"&[^;]*;", "", tokens[-1])
    return cleaned or None


def process_xml_stream(input_stream, min_count=10000):
    """Count surnames in author/editor elements read line by line.

    This is a regex-based scanner, not an XML parser.  It recognises
    ``<author ...>text</author>`` and ``<editor ...>text</editor>`` elements
    whose content holds no nested markup.  An element may span several
    lines; its lines are stripped and joined with single spaces before
    matching.  Every complete element on a line is counted, not only the
    first one.

    Args:
        input_stream: Iterable of text lines (e.g. ``sys.stdin`` or an open
            text file).
        min_count: Minimum number of occurrences a surname needs in order
            to be included in the result.

    Returns:
        List of ``(surname, count)`` tuples with ``count >= min_count``,
        ordered from most to least frequent.
    """
    surname_counter = Counter()

    # Compiled once instead of on every line.  ``\b`` stops tags such as
    # <authors> from matching, and ``\1`` requires the closing tag to have
    # the same name as the opening tag.
    complete_re = re.compile(r"<(author|editor)\b[^>]*>([^<]+)</\1>")
    opening_re = re.compile(r"<(author|editor)\b")

    pending_tag = None  # name of the element currently left open, if any
    pending_parts = []  # stripped lines collected for that open element

    for line in input_stream:
        text = line.strip()

        if pending_tag is not None:
            pending_parts.append(text)
            if f"</{pending_tag}>" not in text:
                continue
            # The open element is closed on this line: scan the joined text.
            text = " ".join(pending_parts)
            pending_tag = None
            pending_parts = []

        tail_start = 0
        for match in complete_re.finditer(text):
            surname = extract_surname(match.group(2))
            if surname:
                surname_counter[surname] += 1
            tail_start = match.end()

        # Start buffering only when an element is opened after the last
        # complete match and is NOT closed on this same text.  An element
        # that is closed here but did not match (empty content or nested
        # markup) is skipped, so the scanner cannot get stuck accumulating
        # the rest of the input.
        opening = opening_re.search(text, tail_start)
        if opening and f"</{opening.group(1)}>" not in text[opening.end():]:
            pending_tag = opening.group(1)
            pending_parts = [text[opening.start():]]

    # Keep only surnames at or above the threshold, most frequent first.
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]


def process_xml_file(filename, min_count=10000):
    """Count surnames in the author/editor elements of an XML file.

    The file is parsed incrementally with ``ElementTree.iterparse``.  If the
    parser raises ``ParseError``, a notice is written to stderr and the file
    is processed again from the beginning with the regex-based
    ``process_xml_stream``; counts gathered before the error are discarded.

    Args:
        filename: Path of the XML file to read.
        min_count: Minimum number of occurrences a surname needs in order
            to be included in the result.

    Returns:
        List of ``(surname, count)`` tuples with ``count >= min_count``,
        ordered from most to least frequent.
    """
    surname_counter = Counter()

    try:
        context = iter(ET.iterparse(filename, events=("start", "end")))
        # The first event is the start of the root element; keep a handle
        # on it so finished records can be detached from the tree.
        _, root = next(context)
        depth = 1  # number of currently open elements, root included

        for event, elem in context:
            if event == "start":
                depth += 1
                continue

            depth -= 1
            if elem.tag in ("author", "editor"):
                if elem.text:
                    surname = extract_surname(elem.text)
                    if surname:
                        surname_counter[surname] += 1
                elem.clear()  # Free memory

            # A direct child of the root has just been completed.  Without
            # this, every finished record stays attached to the root and
            # the whole document accumulates in memory.
            if depth == 1:
                root.clear()

    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...", file=sys.stderr
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)

    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]


def main():
    """Command-line entry point.

    Accepted argument forms:

    * no arguments: read XML from stdin with the default threshold;
    * ``-h`` / ``--help``: print the module docstring and exit with 0;
    * ``MIN_COUNT [FILE]``: the first argument is all digits, so it is the
      threshold; the optional second argument is the input file;
    * ``FILE [MIN_COUNT]``: the first argument is the input file; the
      optional second argument is the threshold.

    Results are printed as ``surname count`` lines.  Any failure, including
    an invalid threshold, is reported on stderr and the process exits with
    status 1.
    """
    min_count = 10000
    filename = None

    # Handle command line arguments
    args = sys.argv[1:]
    if args:
        if args[0] in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)

        if args[0].isdigit():
            raw_count = args[0]
            filename = args[1] if len(args) > 1 else None
        else:
            filename = args[0]
            raw_count = args[1] if len(args) > 1 else None

        if raw_count is not None:
            # Convert here, with explicit handling, so a malformed count is
            # reported like every other error instead of as a traceback.
            try:
                min_count = int(raw_count)
            except ValueError:
                print(
                    f"Error: invalid minimum count: {raw_count!r}", file=sys.stderr
                )
                sys.exit(1)

    try:
        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)

        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")

    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()