deleted bash script for now

2025-06-13 04:57:23 +02:00
parent d9ba23af7b
commit 8f65b8c142
3 changed files with 69 additions and 66 deletions
--- a/surnames.c
+++ b/surnames.c
@@ -8,6 +8,10 @@
 #define HASH_BUCKETS 4000037
 #define MIN_COUNT 10000
 /*
 * Usage: pipe dblp.xml into the program, or keep it in the same folder as the program
 */
 void string_ncopy(char *dest, const char *src, size_t max_len) {
    size_t i = 0;
    while (i < max_len - 1 && src[i]) {
@@ -170,30 +174,6 @@ void make_list(person **hashmap, person **list, const int min_count) {
    free(hashmap);
 }
 /*
  * Duplicate check for a single entry.
  *
  * Walks `list` from its head, counting the nodes whose name compares equal
  * (strcmp) to test->name. If more than one node matches, prints
  * "ERROR: <name>" to stdout. Nothing is modified and nothing is returned.
  */
 void verify_list(person *list, person *test) {
    person *node;
    int matches = 0;
    for (node = list; node != NULL; node = node->next) {
        /* The comparison yields 1 on an exact name match, 0 otherwise. */
        matches += (strcmp(node->name, test->name) == 0);
    }
    if (matches <= 1) {
        return;
    }
    printf("ERROR: %s\n", test->name);
 }
 /*
  * Runs verify_list() once for every node of `list`, always scanning from the
  * head, so each entry is compared against the whole list (quadratic in the
  * list length).
  */
 void check_list(person *list) {
    person *current;
    for (current = list; current != NULL; current = current->next) {
        verify_list(list, current);
    }
 }
 void clean_list(person *list) {
    person *p = list;
    while (p != NULL) {
@@ -246,7 +226,6 @@ int main(void) {
    free(buffer);
    person *list = NULL;
    make_list(hashmap, &list, MIN_COUNT);
    check_list(list);
    display(list);
    clean_list(list);
    return 0;
--- a/surnames_fast.py
+++ b/surnames_fast.py
@@ -0,0 +1,65 @@
 #!/usr/bin/env python3
 """
 Fast DBLP Surname Extractor - Optimized for 4GB+ files
 """
 import sys
 import re
 from collections import defaultdict
 def fast_extract_surnames(input_stream, min_count=10000):
    """Count surnames found in <author>/<editor> elements of a line stream.

    Each line yielded by ``input_stream`` is scanned for author/editor
    elements. For every element the text is stripped, a trailing
    whitespace-separated four-digit token is dropped, the last remaining
    word is taken as the surname, and ``&...;`` entity sequences are deleted
    from it. Surnames that end up empty are ignored.

    Args:
        input_stream: iterable of text lines (an open file, sys.stdin, a list).
        min_count: minimum number of occurrences a surname needs to be kept.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        ordered by count descending; equal counts keep first-seen order.
    """
    # Patterns are compiled once, outside the per-line loop.
    element_re = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
    numeric_suffix_re = re.compile(r"\s+\d{4}$")
    entity_re = re.compile(r"&[^;]*;")
    tally = defaultdict(int)
    for line in input_stream:
        for hit in element_re.finditer(line):
            # Strip, drop the trailing 4-digit token, then split into words.
            tokens = numeric_suffix_re.sub("", hit.group(2).strip()).split()
            if not tokens:
                continue
            # The surname is the last word, with entity sequences removed.
            cleaned = entity_re.sub("", tokens[-1])
            if not cleaned:
                continue
            tally[cleaned] += 1
    frequent = [(name, seen) for name, seen in tally.items() if seen >= min_count]
    # Negated key gives a descending, stable order (ties stay first-seen).
    return sorted(frequent, key=lambda pair: -pair[1])
 def main():
    """Command-line entry point.

    The optional first argument is interpreted in one of two ways:
    if it consists only of digits it is the minimum count and input is read
    from stdin; otherwise it is a path to a UTF-8 text file and the minimum
    count stays at 10000. With no argument, stdin is read with the default
    minimum count. Results are printed one per line as "surname count".
    """
    argument = sys.argv[1] if len(sys.argv) > 1 else None
    if argument is None:
        threshold, path = 10000, None
    elif argument.isdigit():
        threshold, path = int(argument), None
    else:
        threshold, path = 10000, argument
    if path is None:
        # No file given: consume standard input.
        results = fast_extract_surnames(sys.stdin, threshold)
    else:
        with open(path, "r", encoding="utf-8") as handle:
            results = fast_extract_surnames(handle, threshold)
    for surname, count in results:
        print(f"{surname} {count}")
 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()
--- a/validate_extraction.sh
+++ b/validate_extraction.sh
@@ -1,41 +0,0 @@
 #!/bin/bash
 # Cross-check the surname counts printed by the C program (./main) against
 # an independent grep/sed/awk pipeline run over dblp.xml in the current
 # directory. Writes bash_results.txt and c_results.txt, then prints a short
 # side-by-side comparison; the two files are not diffed automatically.
 echo "Running full validation with working pipeline..."
 # Reference pipeline, stage by stage:
 #   grep           - keep lines containing an <author or <editor opening tag
 #   sed            - delete everything up to and including the opening tag,
 #                    then the closing tag and everything after it; the
 #                    leading .* is greedy, so on a line holding several
 #                    tags only the text after the last opening tag survives
 #   awk            - drop the last field when it is exactly four digits,
 #                    take the last remaining field as the surname, delete
 #                    every &...; entity sequence from it, and print it if
 #                    anything is left
 #   sort | uniq -c - count occurrences of each surname
 #   awk            - keep surnames counted at least 10000 times and print
 #                    them as: surname count
 #   sort -k2 -nr   - numeric, descending sort keyed from the second field
 #                    (the count)
 echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
 {
    grep -E "<(author|editor)" dblp.xml | \
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
    awk '{
        if ($NF ~ /^[0-9]{4}$/) { NF-- }
        if (NF > 0) { 
            surname = $NF
            gsub(/&[^;]*;/, "", surname)
            if (length(surname) > 0) print surname 
        }
    }' | \
    sort | uniq -c | \
    awk '$1 >= 10000 {print $2, $1}' | \
    sort -k2 -nr
 } > bash_results.txt
 echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"
 # Run the C program (./main, invoked here with no arguments and no input
 # redirection) and capture its standard output.
 ./main > c_results.txt
 echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"
 # Show the first five lines of each result file for a quick visual comparison.
 echo "Top 5 comparison:"
 echo "=== C Program ==="
 head -5 c_results.txt
 echo "=== Bash Script ==="
 head -5 bash_results.txt
 # Spot-check one surname: print the line starting with "Wang " from each file.
 echo "Wang comparison:"
 echo "C: $(grep "^Wang " c_results.txt)"
 echo "Bash: $(grep "^Wang " bash_results.txt)"