164 lines
4.8 KiB
Python
164 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
DBLP Surname Extractor
|
|
Replicates C program logic for surname frequency analysis
|
|
Usage: gunzip -c dblp.xml.gz | python surnames.py
|
|
python surnames.py dblp.xml
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
from collections import Counter
|
|
|
|
|
|
def extract_surname(name_text):
    """Return the surname part of an author/editor name, or ``None``.

    The surname is the last whitespace-separated word of ``name_text``.
    A trailing token made of exactly four digits is dropped first, but only
    when the name has more than one word (so a lone ``"2020"`` is kept).
    Any ``&...;`` sequence inside the chosen word is deleted, not decoded.

    Args:
        name_text: Text content of an ``<author>``/``<editor>`` element.
            May be ``None`` or empty.

    Returns:
        The surname string, or ``None`` when the input is empty, contains
        only whitespace, or the chosen word is empty after entity removal.
    """
    if not name_text:
        return None

    # str.split() with no argument already ignores leading/trailing whitespace.
    words = name_text.split()
    if not words:
        return None

    # Drop a trailing 4-digit token (the original comment calls it a year).
    # The len() guard guarantees at least one word remains afterwards, so no
    # further emptiness check is needed.
    if len(words) > 1 and re.fullmatch(r"\d{4}", words[-1]):
        words.pop()

    # Basic cleanup: remove XML entity references such as "&uuml;" outright.
    surname = re.sub(r"&[^;]*;", "", words[-1])

    return surname or None
|
|
|
|
|
|
def process_xml_stream(input_stream, min_count=10000):
    """Count author/editor surnames in XML text read line by line.

    This is a regex scanner rather than an XML parser. Each line is stripped
    and searched for complete ``<author>...</author>`` or
    ``<editor>...</editor>`` elements; every complete element on the line is
    counted. An element whose opening and closing tags are on different
    lines is buffered until the closing tag arrives and is then counted once.

    Args:
        input_stream: Iterable yielding text lines (for example ``sys.stdin``
            or an open text file).
        min_count: Minimum number of occurrences a surname needs in order to
            appear in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        in ``Counter.most_common()`` order (highest count first).
    """
    # Compiled once up front: both patterns are applied to every input line.
    complete_element = re.compile(
        r"<(author|editor)[^>]*>([^<]+)</(author|editor)>"
    )
    opening_tag = re.compile(r"<(author|editor)")

    surname_counter = Counter()

    def count_name(content):
        """Add the surname found in ``content`` (if any) to the tally."""
        surname = extract_surname(content)
        if surname:
            surname_counter[surname] += 1

    # State for an element that spans several lines. A non-empty
    # ``pending_lines`` means we are waiting for ``</pending_tag>``.
    pending_lines = []
    pending_tag = ""

    for line in input_stream:
        line = line.strip()

        # Complete elements on this line: count all of them, not just the
        # first. The multi-line state is deliberately left untouched.
        complete = complete_element.findall(line)
        if complete:
            for _open_tag, content, _close_tag in complete:
                count_name(content)
            continue

        opened = opening_tag.search(line)
        if opened:
            pending_tag = opened.group(1)
            if f"</{pending_tag}>" in line:
                # The element opens and closes on this line but has no plain
                # text content (empty or nested markup). Do not wait for a
                # closing tag that has already gone by; otherwise every later
                # line would be buffered indefinitely.
                pending_lines = []
                pending_tag = ""
            else:
                # Start (or restart) buffering a multi-line element.
                pending_lines = [line]
            continue

        if not pending_lines:
            continue

        pending_lines.append(line)
        if f"</{pending_tag}>" in line:
            # Closing tag reached: extract the content between the tags.
            element_text = " ".join(pending_lines)
            spanning = re.search(
                rf"<{pending_tag}[^>]*>([^<]+)</{pending_tag}>", element_text
            )
            if spanning:
                count_name(spanning.group(1))
            pending_lines = []
            pending_tag = ""

    # Return surnames above threshold, sorted by count.
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
|
|
|
|
|
|
def process_xml_file(filename, min_count=10000):
    """Count author/editor surnames in an XML file using ElementTree.

    The file is parsed incrementally with ``ET.iterparse``. The text of each
    ``author`` and ``editor`` element is passed to ``extract_surname`` and
    tallied. If the parser raises ``ET.ParseError``, a message is written to
    stderr, the partial tally is discarded, and the whole file is re-read
    with the line-based ``process_xml_stream`` instead.

    Args:
        filename: Path of the XML file to read.
        min_count: Minimum number of occurrences a surname needs in order to
            appear in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        in ``Counter.most_common()`` order (highest count first).
    """
    surname_counter = Counter()

    try:
        context = ET.iterparse(filename, events=("start", "end"))

        # The first event is the start of the document root. Keep a handle
        # on it so finished records can be detached from the tree.
        _event, root = next(context)
        depth = 1  # the root element is currently open

        for event, elem in context:
            if event == "start":
                depth += 1
                continue

            # event == "end": the element is complete and its text is final.
            depth -= 1

            if elem.tag in ("author", "editor"):
                if elem.text:
                    surname = extract_surname(elem.text)
                    if surname:
                        surname_counter[surname] += 1
                elem.clear()  # Free memory

            if depth == 1:
                # A direct child of the root has just closed. Clearing only
                # the author/editor elements leaves every record attached to
                # the root, so memory would grow with the size of the file;
                # dropping the root's children keeps it bounded.
                root.clear()

    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...",
            file=sys.stderr,
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)

    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, run the count, print it.

    Argument forms read from ``sys.argv``:
        (none)              read XML from stdin, threshold 10000
        -h | --help         print the module docstring and exit with status 0
        MIN_COUNT [FILE]    first argument is all digits: use it as threshold
        FILE [MIN_COUNT]    otherwise the first argument is the input file

    Results are printed one per line as ``<surname> <count>``. A threshold
    that is not a valid integer, or any exception raised while processing,
    is reported on stderr as ``Error: ...`` and the process exits with
    status 1. ``KeyboardInterrupt`` exits with status 1 silently.
    """
    min_count = 10000
    filename = None
    count_text = None

    # Handle command line arguments
    args = sys.argv[1:]
    if args:
        first = args[0]
        if first in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)

        second = args[1] if len(args) > 1 else None
        if first.isdigit():
            count_text, filename = first, second
        else:
            filename, count_text = first, second

    if count_text is not None:
        # Convert here with explicit handling so a bad threshold produces the
        # same "Error: ..." output as other failures instead of a traceback.
        try:
            min_count = int(count_text)
        except ValueError:
            print(
                f"Error: invalid minimum count: {count_text!r}",
                file=sys.stderr,
            )
            sys.exit(1)

    try:
        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)

        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")

    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|