From 8f65b8c14212e0cd0442f3018721a0a355bc9059 Mon Sep 17 00:00:00 2001 From: Matthias Puchstein Date: Fri, 13 Jun 2025 04:57:23 +0200 Subject: [PATCH] deleted bash script for now --- main.c => surnames.c | 29 +++---------------- surnames_fast.py | 65 ++++++++++++++++++++++++++++++++++++++++++ validate_extraction.sh | 41 -------------------------- 3 files changed, 69 insertions(+), 66 deletions(-) rename main.c => surnames.c (92%) create mode 100644 surnames_fast.py delete mode 100755 validate_extraction.sh diff --git a/main.c b/surnames.c similarity index 92% rename from main.c rename to surnames.c index b2c91b5..dc21ee0 100644 --- a/main.c +++ b/surnames.c @@ -8,6 +8,10 @@ #define HASH_BUCKETS 4000037 #define MIN_COUNT 10000 +/* + * Usage: pipe dblp.xml to the program or have it in the same folder as the program + */ + void string_ncopy(char *dest, const char *src, size_t max_len) { size_t i = 0; while (i < max_len - 1 && src[i]) { @@ -170,30 +174,6 @@ void make_list(person **hashmap, person **list, const int min_count) { free(hashmap); } -void verify_list(person *list, person *test) { - person *p = list; - int count = 0; - while (p != NULL) { - person *next = p->next; - if (strcmp(p->name, test->name) == 0) { - count++; - } - p = next; - } - if (count > 1) { - printf("ERROR: %s\n", test->name); - } -} - -void check_list(person *list) { - person *p = list; - while (p != NULL) { - person *next = p->next; - verify_list(list, p); - p = next; - } -} - void clean_list(person *list) { person *p = list; while (p != NULL) { @@ -246,7 +226,6 @@ int main(void) { free(buffer); person *list = NULL; make_list(hashmap, &list, MIN_COUNT); - check_list(list); display(list); clean_list(list); return 0; diff --git a/surnames_fast.py b/surnames_fast.py new file mode 100644 index 0000000..e5b48c1 --- /dev/null +++ b/surnames_fast.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +""" +Fast DBLP Surname Extractor - Optimized for 4GB+ files +""" + +import sys +import re +from collections 
import defaultdict + + +def fast_extract_surnames(input_stream, min_count=10000): + """Memory-efficient surname extraction""" + surname_counts = defaultdict(int) + + # Compile regex for performance + tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)") + year_pattern = re.compile(r"\s+\d{4}$") + entity_pattern = re.compile(r"&[^;]*;") + + for line in input_stream: + for match in tag_pattern.finditer(line): + content = match.group(2).strip() + + # Remove year suffix + content = year_pattern.sub("", content) + + # Get surname (last word) + words = content.split() + if words: + surname = words[-1] + # Clean entities + surname = entity_pattern.sub("", surname) + if surname: + surname_counts[surname] += 1 + + # Filter and sort results + results = [ + (surname, count) + for surname, count in surname_counts.items() + if count >= min_count + ] + results.sort(key=lambda x: x[1], reverse=True) + + return results + + +def main(): + min_count = ( + int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000 + ) + + if len(sys.argv) > 1 and not sys.argv[1].isdigit(): + # Read from file + with open(sys.argv[1], "r", encoding="utf-8") as f: + results = fast_extract_surnames(f, min_count) + else: + # Read from stdin + results = fast_extract_surnames(sys.stdin, min_count) + + for surname, count in results: + print(f"{surname} {count}") + + +if __name__ == "__main__": + main() diff --git a/validate_extraction.sh b/validate_extraction.sh deleted file mode 100755 index c40da3b..0000000 --- a/validate_extraction.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -echo "Running full validation with working pipeline..." - -# Generate complete bash results using the proven pipeline -echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..." 
-{ - grep -E "<(author|editor)" dblp.xml | \ - sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \ - awk '{ - if ($NF ~ /^[0-9]{4}$/) { NF-- } - if (NF > 0) { - surname = $NF - gsub(/&[^;]*;/, "", surname) - if (length(surname) > 0) print surname - } - }' | \ - sort | uniq -c | \ - awk '$1 >= 10000 {print $2, $1}' | \ - sort -k2 -nr -} > bash_results.txt - -echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames" - -# Get your C program results -./main > c_results.txt - -echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames" - -# Quick comparison of top entries -echo "Top 5 comparison:" -echo "=== C Program ===" -head -5 c_results.txt -echo "=== Bash Script ===" -head -5 bash_results.txt - -# Check Wang specifically -echo "Wang comparison:" -echo "C: $(grep "^Wang " c_results.txt)" -echo "Bash: $(grep "^Wang " bash_results.txt)" -