added py script for validation
This commit is contained in:
65
surnames.py
Normal file
65
surnames.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fast DBLP Surname Extractor - Optimized for 4GB+ files
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def fast_extract_surnames(input_stream, min_count=10000):
    """Count surnames found in <author>/<editor> elements of a line stream.

    The input is consumed one line at a time, so only the surname tally is
    kept in memory rather than the whole document.

    Args:
        input_stream: Iterable of text lines (an open text file,
            ``sys.stdin``, or a list of strings).
        min_count: Minimum number of occurrences a surname needs in order
            to be included in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        sorted by descending count. Surnames with equal counts keep the
        order in which they were first seen (the sort is stable).
    """
    surname_counts = defaultdict(int)

    # Patterns are compiled once, outside the per-line loop.
    # The tag name must be followed by whitespace (attributes) or '>' so that
    # look-alike tags such as <authors> are ignored, and the closing tag must
    # repeat the opening tag name (\1) so <author>...</editor> is rejected.
    tag_pattern = re.compile(r"<(author|editor)(?:\s[^>]*)?>([^<]+)</\1>")
    # Trailing whitespace followed by exactly four digits at the end of a name.
    year_pattern = re.compile(r"\s+\d{4}$")
    # Any "&...;" entity reference.
    entity_pattern = re.compile(r"&[^;]*;")

    for line in input_stream:
        for match in tag_pattern.finditer(line):
            # Strip surrounding whitespace, then drop the four-digit suffix.
            content = year_pattern.sub("", match.group(2).strip())

            words = content.split()
            if not words:
                continue

            # The surname is taken to be the last whitespace-separated word.
            # Entity references are removed outright, not decoded, so
            # "M&uuml;ller" is counted as "Mller".
            surname = entity_pattern.sub("", words[-1])
            if surname:
                surname_counts[surname] += 1

    # Keep only sufficiently frequent surnames, most frequent first.
    results = [
        (surname, count)
        for surname, count in surname_counts.items()
        if count >= min_count
    ]
    results.sort(key=lambda item: item[1], reverse=True)

    return results
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: print "<surname> <count>" lines.

    The first command-line argument, if present, selects the mode:

    * all digits  -> used as the minimum count; input is read from stdin
    * anything else -> treated as a UTF-8 input file path; the minimum
      count stays at the default of 10000
    * absent -> input is read from stdin with the default minimum count
    """
    first_arg = sys.argv[1] if len(sys.argv) > 1 else None

    if first_arg is None:
        # No argument: stdin with the default threshold.
        ranked = fast_extract_surnames(sys.stdin, 10000)
    elif first_arg.isdigit():
        # Numeric argument: it is the threshold, data still comes from stdin.
        ranked = fast_extract_surnames(sys.stdin, int(first_arg))
    else:
        # Non-numeric argument: it is the path of the file to scan.
        with open(first_arg, "r", encoding="utf-8") as handle:
            ranked = fast_extract_surnames(handle, 10000)

    for name, total in ranked:
        print(f"{name} {total}")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user