added py script for validation

This commit is contained in:
2025-06-13 04:58:34 +02:00
parent d35c253afa
commit b669c7135a

65
surnames.py Normal file
View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Fast DBLP Surname Extractor - Optimized for 4GB+ files
"""
import html
import re
import sys
from collections import defaultdict
# Patterns are compiled once at import time; the extraction loop runs per line.
_PERSON_TAG = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
# Trailing whitespace + exactly four digits at the end of a name.
# NOTE(review): the original comment called this a "year suffix"; in DBLP data
# it is presumably the homonym-disambiguation number ("Wei Wang 0001") - confirm.
_NUMERIC_SUFFIX = re.compile(r"\s+\d{4}$")
_ENTITY = re.compile(r"&[^;]*;")


def _decode_entity(match):
    """Return the text for a recognised character reference, or "" otherwise."""
    raw = match.group(0)
    decoded = html.unescape(raw)
    # html.unescape leaves unknown references untouched; drop those, which is
    # what happened to *every* reference before decoding was introduced.
    return "" if decoded == raw else decoded


def fast_extract_surnames(input_stream, min_count=10000):
    """Count surnames found in ``<author>``/``<editor>`` elements of a stream.

    The stream is consumed one line at a time, so only the per-surname
    counters are kept in memory. For every author/editor element on a line,
    the element text is stripped, a trailing four-digit suffix is removed,
    and the last whitespace-separated word is taken as the surname.

    Character references inside the surname (``&uuml;``, ``&#252;``) are
    decoded to the character they stand for, so ``M&uuml;ller`` is counted as
    ``Müller`` instead of the mangled ``Mller``. References that cannot be
    decoded are removed.

    Args:
        input_stream: Iterable of text lines (an open file, ``sys.stdin``, or
            a list of strings).
        min_count: Minimum number of occurrences a surname needs in order to
            be included in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        sorted by count in descending order. Surnames with equal counts keep
        the order in which they were first seen.
    """
    surname_counts = defaultdict(int)

    for line in input_stream:
        for match in _PERSON_TAG.finditer(line):
            content = _NUMERIC_SUFFIX.sub("", match.group(2).strip())
            words = content.split()
            if not words:
                continue

            surname = words[-1]
            # Cheap membership test keeps the common, entity-free case fast.
            if "&" in surname:
                surname = _ENTITY.sub(_decode_entity, surname)
            if surname:
                surname_counts[surname] += 1

    results = [
        (surname, count)
        for surname, count in surname_counts.items()
        if count >= min_count
    ]
    # list.sort is stable, so ties stay in first-seen order.
    results.sort(key=lambda item: item[1], reverse=True)
    return results
def main():
    """Command-line entry point: print ``surname count`` lines to stdout.

    Usage::

        surnames.py [MIN_COUNT] [INPUT_FILE]

    The two optional arguments may be given in either order. The first
    argument made up solely of decimal digits is taken as the minimum
    occurrence count (default 10000); the first other argument is taken as
    the path of a UTF-8 text file to read. Without a path the input is read
    from standard input. Any further arguments are ignored.

    Previously only ``sys.argv[1]`` was inspected, so a threshold and an
    input file could not be combined; invocations with zero or one argument
    behave as before.
    """
    min_count = None
    input_path = None
    for arg in sys.argv[1:]:
        # isdecimal() matches exactly what int() can parse; isdigit() also
        # accepts characters such as superscript digits that int() rejects.
        if arg.isdecimal():
            if min_count is None:
                min_count = int(arg)
        elif input_path is None:
            input_path = arg

    if min_count is None:
        min_count = 10000

    if input_path is not None:
        with open(input_path, "r", encoding="utf-8") as f:
            results = fast_extract_surnames(f, min_count)
    else:
        results = fast_extract_surnames(sys.stdin, min_count)

    for surname, count in results:
        print(f"{surname} {count}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()