some small fixes

2025-06-13 05:51:53 +02:00
parent aa6cde55f4
commit 8e9cd18fa6
3 changed files with 170 additions and 202 deletions
--- a/surnames.py
+++ b/surnames.py
@@ -1,64 +1,162 @@
 #!/usr/bin/env python3
 """
-Fast DBLP Surname Extractor - Optimized for 4GB+ files
+DBLP Surname Extractor
+Replicates C program logic for surname frequency analysis
+Usage: gunzip -c dblp.xml.gz | python surnames.py
+       python surnames.py dblp.xml
 """

-import sys
 import re
-from collections import defaultdict
+import sys
+import xml.etree.ElementTree as ET
+from collections import Counter


-def fast_extract_surnames(input_stream, min_count=10000):
-    """Memory-efficient surname extraction"""
-    surname_counts = defaultdict(int)
+def extract_surname(name_text):
+    """Extract surname using same logic as C program"""
+    if not name_text:
+        return None

-    # Compile regex for performance
-    tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
-    year_pattern = re.compile(r"\s+\d{4}$")
-    entity_pattern = re.compile(r"&[^;]*;")
+    # Split into words
+    words = name_text.strip().split()
+    if not words:
+        return None
+
+    # Remove 4-digit year if present at end
+    if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
+        words = words[:-1]
+
+    if not words:
+        return None
+
+    # Return last word as surname
+    surname = words[-1]
+
+    # Clean up XML entities (basic cleanup)
+    surname = re.sub(r"&[^;]*;", "", surname)
+
+    return surname if surname else None
+
+
+def process_xml_stream(input_stream, min_count=10000):
+    """Process XML from stream (for piping)"""
+    surname_counter = Counter()
+
+    # Read and process line by line for memory efficiency
+    current_element = ""
+    in_author_or_editor = False
+    tag_name = ""

    for line in input_stream:
-        for match in tag_pattern.finditer(line):
-            content = match.group(2).strip()
+        line = line.strip()

-            # Remove year suffix
-            content = year_pattern.sub("", content)
+        # Check for author or editor tags
+        author_match = re.search(
+            r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
+        )
+        if author_match:
+            tag_name = author_match.group(1)
+            content = author_match.group(2)
+            surname = extract_surname(content)
+            if surname:
+                surname_counter[surname] += 1
+        else:
+            # Handle multi-line tags
+            if re.search(r"<(author|editor)", line):
+                in_author_or_editor = True
+                tag_match = re.search(r"<(author|editor)", line)
+                tag_name = tag_match.group(1)
+                current_element = line
+            elif in_author_or_editor:
+                current_element += " " + line
+                if f"</{tag_name}>" in line:
+                    # Extract content between tags
+                    content_match = re.search(
+                        rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
+                    )
+                    if content_match:
+                        content = content_match.group(1)
+                        surname = extract_surname(content)
+                        if surname:
+                            surname_counter[surname] += 1
+                    in_author_or_editor = False
+                    current_element = ""

-            # Get surname (last word)
-            words = content.split()
-            if words:
-                surname = words[-1]
-                # Clean entities
-                surname = entity_pattern.sub("", surname)
-                if surname:
-                    surname_counts[surname] += 1
-
-    # Filter and sort results
-    results = [
+    # Return surnames above threshold, sorted by count
+    return [
        (surname, count)
-        for surname, count in surname_counts.items()
+        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
-    results.sort(key=lambda x: x[1], reverse=True)

-    return results
+
+def process_xml_file(filename, min_count=10000):
+    """Process XML file using ElementTree (more robust)"""
+    surname_counter = Counter()
+
+    try:
+        # Parse XML incrementally for memory efficiency
+        context = ET.iterparse(filename, events=("start", "end"))
+        context = iter(context)
+        event, root = next(context)
+
+        for event, elem in context:
+            if event == "end" and elem.tag in ["author", "editor"]:
+                if elem.text:
+                    surname = extract_surname(elem.text)
+                    if surname:
+                        surname_counter[surname] += 1
+                elem.clear()  # Free memory
+
+    except ET.ParseError:
+        # Fallback to line-by-line processing
+        print(
+            "XML parsing failed, falling back to regex processing...", file=sys.stderr
+        )
+        with open(filename, "r", encoding="utf-8") as f:
+            return process_xml_stream(f, min_count)
+
+    return [
+        (surname, count)
+        for surname, count in surname_counter.most_common()
+        if count >= min_count
+    ]


 def main():
-    min_count = (
-        int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000
-    )
+    min_count = 10000

-    if len(sys.argv) > 1 and not sys.argv[1].isdigit():
-        # Read from file
-        with open(sys.argv[1], "r", encoding="utf-8") as f:
-            results = fast_extract_surnames(f, min_count)
+    # Handle command line arguments
+    if len(sys.argv) > 1:
+        if sys.argv[1] in ["-h", "--help"]:
+            print(__doc__)
+            sys.exit(0)
+        elif sys.argv[1].isdigit():
+            min_count = int(sys.argv[1])
+            filename = sys.argv[2] if len(sys.argv) > 2 else None
+        else:
+            filename = sys.argv[1]
+            min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
    else:
-        # Read from stdin
-        results = fast_extract_surnames(sys.stdin, min_count)
+        filename = None

-    for surname, count in results:
-        print(f"{surname} {count}")
+    try:
+        if filename:
+            # Read from file
+            results = process_xml_file(filename, min_count)
+        else:
+            # Read from stdin (piped input)
+            results = process_xml_stream(sys.stdin, min_count)
+
+        # Output results in same format as C program
+        for surname, count in results:
+            print(f"{surname} {count}")
+
+    except KeyboardInterrupt:
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)


 if __name__ == "__main__":