#!/usr/bin/env python3
"""
DBLP Surname Extractor

Replicates C program logic for surname frequency analysis.

Usage:
    gunzip -c dblp.xml.gz | python surnames.py [min_count]
    python surnames.py dblp.xml [min_count]
    python surnames.py min_count [dblp.xml]
"""

import re
import sys
import xml.etree.ElementTree as ET
from collections import Counter

# Surnames seen fewer times than this are not reported unless overridden.
DEFAULT_MIN_COUNT = 10000

# Elements whose text content is a person name.
_PERSON_TAGS = ("author", "editor")

# Trailing token made of exactly four digits (treated as a year and dropped).
_FOUR_DIGITS_RE = re.compile(r"\d{4}")

# Any "&...;" entity reference; these are removed, not decoded.
_ENTITY_RE = re.compile(r"&[^;]*;")

# A complete <author>...</author> or <editor>...</editor> element with
# non-empty, tag-free content. The backreference forces matching tag names.
_COMPLETE_ELEMENT_RE = re.compile(r"<(author|editor)\b[^>]*>([^<]+)</\1>")

# The start of an author/editor opening tag.
_OPEN_TAG_RE = re.compile(r"<(author|editor)\b")


def extract_surname(name_text):
    """Extract surname using same logic as C program.

    The name is split on whitespace, a trailing four-digit token is dropped
    when at least one other word precedes it, and the last remaining word is
    returned with every ``&...;`` entity reference removed.

    Args:
        name_text: Raw text of an author/editor element (may be ``None``).

    Returns:
        The surname, or ``None`` when the input is empty/blank or nothing is
        left after entity removal.
    """
    if not name_text:
        return None

    words = name_text.split()
    if not words:
        return None

    # Only strip the numeric suffix when another word remains afterwards.
    if len(words) > 1 and _FOUR_DIGITS_RE.fullmatch(words[-1]):
        words.pop()

    surname = _ENTITY_RE.sub("", words[-1])
    return surname or None


def _count_name(counter, name_text):
    """Increment *counter* for the surname found in *name_text*, if any."""
    surname = extract_surname(name_text)
    if surname:
        counter[surname] += 1


def _above_threshold(counter, min_count):
    """Return (surname, count) pairs with count >= min_count, most common first."""
    return [
        (surname, count)
        for surname, count in counter.most_common()
        if count >= min_count
    ]


def process_xml_stream(input_stream, min_count=DEFAULT_MIN_COUNT):
    """Process XML from stream (for piping).

    Scans the input line by line with regular expressions instead of a real
    XML parser. Every complete author/editor element on a line is counted.
    An element whose closing tag is not on the same line is buffered until a
    later line contains that closing tag, then processed as one joined line.

    Args:
        input_stream: Iterable of text lines (e.g. ``sys.stdin`` or a file).
        min_count: Minimum number of occurrences for a surname to be returned.

    Returns:
        List of ``(surname, count)`` tuples sorted by descending count.
    """
    surname_counter = Counter()
    pending_lines = []  # lines of an element still waiting for its end tag
    pending_tag = ""

    for raw_line in input_stream:
        line = raw_line.strip()

        if pending_lines:
            pending_lines.append(line)
            if f"</{pending_tag}>" not in line:
                continue
            # Closing tag arrived: process the whole buffered element at once.
            line = " ".join(pending_lines)
            pending_lines = []

        scanned_up_to = 0
        for element in _COMPLETE_ELEMENT_RE.finditer(line):
            _count_name(surname_counter, element.group(2))
            scanned_up_to = element.end()

        # An opening tag after the last complete element, with no closing tag
        # behind it, means the element continues on the following line(s).
        opening = _OPEN_TAG_RE.search(line, scanned_up_to)
        if opening:
            tag_name = opening.group(1)
            remainder = line[opening.start():]
            if f"</{tag_name}>" not in remainder:
                pending_tag = tag_name
                pending_lines = [remainder]

    return _above_threshold(surname_counter, min_count)


def process_xml_file(filename, min_count=DEFAULT_MIN_COUNT):
    """Process XML file using ElementTree (more robust).

    Parses the file incrementally. If the parser raises ``ParseError`` the
    counts gathered so far are discarded and the file is re-read with the
    regex-based :func:`process_xml_stream`.

    Args:
        filename: Path of the XML file to read.
        min_count: Minimum number of occurrences for a surname to be returned.

    Returns:
        List of ``(surname, count)`` tuples sorted by descending count.

    Raises:
        OSError: If the file cannot be opened.
    """
    surname_counter = Counter()

    try:
        context = iter(ET.iterparse(filename, events=("start", "end")))
        _, root = next(context)  # first event is the start of the root

        depth = 0  # nesting level below the root element
        for event, elem in context:
            if event == "start":
                depth += 1
                continue

            depth -= 1
            if elem.tag in _PERSON_TAGS:
                _count_name(surname_counter, elem.text)

            # A direct child of the root just ended: drop everything parsed
            # so far, otherwise finished records pile up under the root.
            if depth == 0:
                root.clear()
    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...",
            file=sys.stderr,
        )
        # errors="replace" keeps the fallback alive on bytes that are not
        # valid UTF-8 instead of aborting with UnicodeDecodeError.
        with open(filename, "r", encoding="utf-8", errors="replace") as f:
            return process_xml_stream(f, min_count)

    return _above_threshold(surname_counter, min_count)


def _parse_min_count(value):
    """Convert a command-line value to int, raising a readable ValueError."""
    try:
        return int(value)
    except ValueError:
        raise ValueError(
            f"min_count must be an integer, got {value!r}"
        ) from None


def _parse_args(args):
    """Return ``(filename, min_count)`` from the arguments after the script name.

    Accepted forms: ``[]``, ``[min_count]``, ``[min_count, filename]``,
    ``[filename]`` and ``[filename, min_count]``. ``filename`` is ``None``
    when input should come from stdin.
    """
    if not args:
        return None, DEFAULT_MIN_COUNT

    if args[0].isdigit():
        filename = args[1] if len(args) > 1 else None
        return filename, _parse_min_count(args[0])

    min_count = _parse_min_count(args[1]) if len(args) > 1 else DEFAULT_MIN_COUNT
    return args[0], min_count


def main():
    """Command-line entry point: print ``surname count`` lines to stdout."""
    args = sys.argv[1:]
    if args and args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    try:
        filename, min_count = _parse_args(args)

        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)

        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()