some small fixes

update gitignore
added new bash validation scripts
2025-06-13 05:51:53 +02:00 · 2025-06-13 05:05:52 +02:00 · 2025-06-13 05:05:33 +02:00 · 2025-06-13 04:58:34 +02:00 · 2025-06-13 04:57:30 +02:00 · 2025-06-13 04:57:23 +02:00
6 changed files with 257 additions and 69 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,6 @@ dkms.conf
 /bash_results.txt
 /c_results.txt
 /surnames
 /c_quick.txt
 /python_quick.txt
--- a/surnames.c
+++ b/surnames.c
@@ -8,6 +8,10 @@
 #define HASH_BUCKETS 4000037
 #define MIN_COUNT 10000
 /*
 * Usage: pipe dblp.xml to the program or have it in the same folder as the program
 */
 void string_ncopy(char *dest, const char *src, size_t max_len) {
    size_t i = 0;
    while (i < max_len - 1 && src[i]) {
@@ -23,10 +27,12 @@ typedef struct person {
    int count;
 } person;
-void newPerson(person *p, const char *name) {
+person* newPerson(const char *name) {
    person *p = (person *) malloc(sizeof(person));
    string_ncopy(p->name, name, BUFFER_LENGTH);
    p->count = 1;
    p->next = NULL;
    return p;
 }
 void sorted_name_insert(person **head, char *name) {
@@ -39,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
        p = p->next;
    }
-    person *node = (person *) malloc(sizeof(person));
+    person *node = newPerson(name);
-    newPerson(node, name);
    if (*head == NULL || strcmp((*head)->name, name) > 0) {
        node->next = *head;
@@ -170,30 +175,6 @@ void make_list(person **hashmap, person **list, const int min_count) {
    free(hashmap);
 }
 /*
  * Debug helper: walks `list` and counts the nodes whose name equals
  * test->name. Prints "ERROR: <name>" to stdout when more than one node
  * carries that name. The list is not modified.
  */
 void verify_list(person *list, person *test) {
    int matches = 0;
    for (person *cur = list; cur != NULL; cur = cur->next) {
        if (strcmp(cur->name, test->name) == 0) {
            matches++;
        }
    }
    /* A single match is the node itself (when test is part of the list). */
    if (matches <= 1) {
        return;
    }
    printf("ERROR: %s\n", test->name);
 }
 /*
  * Debug helper: runs verify_list() once for every node of `list`, so each
  * name that occurs more than once is reported. Quadratic in the list
  * length, since every call walks the whole list again.
  */
 void check_list(person *list) {
    for (person *cur = list; cur != NULL; cur = cur->next) {
        verify_list(list, cur);
    }
 }
 void clean_list(person *list) {
    person *p = list;
    while (p != NULL) {
@@ -246,7 +227,6 @@ int main(void) {
    free(buffer);
    person *list = NULL;
    make_list(hashmap, &list, MIN_COUNT);
-    check_list(list);
    display(list);
    clean_list(list);
    return 0;
--- a/surnames.py
+++ b/surnames.py
@@ -0,0 +1,163 @@
 #!/usr/bin/env python3
 """
 DBLP Surname Extractor
 Replicates C program logic for surname frequency analysis
 Usage: gunzip -c dblp.xml.gz | python surnames.py
       python surnames.py dblp.xml
 """
 import re
 import sys
 import xml.etree.ElementTree as ET
 from collections import Counter
 def extract_surname(name_text):
    """Return the surname token of an author/editor name, or None.

    The surname is the last whitespace-separated word of ``name_text``.
    A trailing token made of exactly four digits is dropped first, unless it
    is the only word. Every ``&...;`` sequence is then removed from the
    surname. None is returned for empty/blank input or when nothing is left
    after that cleanup.
    """
    tokens = name_text.strip().split() if name_text else []
    if not tokens:
        return None
    # Skip a trailing 4-digit token; at least one word always remains.
    if len(tokens) > 1 and re.fullmatch(r"\d{4}", tokens[-1]):
        tokens.pop()
    # Basic cleanup of XML entities such as "&uuml;".
    cleaned = re.sub(r"&[^;]*;", "", tokens[-1])
    return cleaned or None
 def process_xml_stream(input_stream, min_count=10000):
    """Count surnames from XML read line by line (suitable for piped input).

    Every ``<author>``/``<editor>`` element that opens and closes on one line
    is counted, including several elements on the same line. An element whose
    opening tag, text and closing tag are spread over several lines is
    accumulated until its closing tag is seen and then counted once.

    Args:
        input_stream: iterable of text lines (e.g. ``sys.stdin`` or a file).
        min_count: minimum number of occurrences for a surname to be reported.

    Returns:
        List of ``(surname, count)`` tuples with ``count >= min_count``,
        ordered by descending count (``Counter.most_common`` order).
    """
    # Compiled once instead of per line; the back-reference forces the closing
    # tag to be the same kind as the opening tag.
    complete_tag = re.compile(r"<(author|editor)[^>]*>([^<]+)</\1>")
    opening_tag = re.compile(r"<(author|editor)")
    surname_counter = Counter()
    current_element = ""
    in_author_or_editor = False
    tag_name = ""
    for line in input_stream:
        line = line.strip()
        complete = complete_tag.findall(line)
        if complete:
            # Count every complete element on the line, not only the first.
            # tag_name is deliberately left untouched so that an open
            # multi-line element still waits for its own closing tag.
            for _, content in complete:
                surname = extract_surname(content)
                if surname:
                    surname_counter[surname] += 1
            continue
        opened = opening_tag.search(line)
        if opened:
            # Start of an element that does not close on this line.
            in_author_or_editor = True
            tag_name = opened.group(1)
            current_element = line
        elif in_author_or_editor:
            current_element += " " + line
            if f"</{tag_name}>" in line:
                # Extract content between tags
                content_match = re.search(
                    rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
                )
                if content_match:
                    surname = extract_surname(content_match.group(1))
                    if surname:
                        surname_counter[surname] += 1
                in_author_or_editor = False
                current_element = ""
    # Return surnames above threshold, sorted by count
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
 def process_xml_file(filename, min_count=10000):
    """Count surnames in an XML file using incremental ElementTree parsing.

    The text of every ``<author>``/``<editor>`` element is passed to
    ``extract_surname`` and the results are tallied. If ElementTree raises
    ``ParseError``, the counts gathered so far are discarded and the file is
    re-read from the start with ``process_xml_stream``.

    Args:
        filename: path of the XML file to read.
        min_count: minimum number of occurrences for a surname to be reported.

    Returns:
        List of ``(surname, count)`` tuples with ``count >= min_count``,
        ordered by descending count (``Counter.most_common`` order).
    """
    surname_counter = Counter()
    try:
        # Parse XML incrementally for memory efficiency
        context = ET.iterparse(filename, events=("start", "end"))
        context = iter(context)
        event, root = next(context)
        # The root's "start" event was consumed above, so we are at depth 1.
        depth = 1
        for event, elem in context:
            if event == "start":
                depth += 1
                continue
            if elem.tag in ["author", "editor"]:
                if elem.text:
                    surname = extract_surname(elem.text)
                    if surname:
                        surname_counter[surname] += 1
                elem.clear()  # Free memory
            depth -= 1
            if depth == 1:
                # A direct child of the root has just been closed. Detach all
                # finished children, otherwise every processed record stays
                # referenced by the root and memory grows with the file size.
                del root[:]
    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...", file=sys.stderr
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
 def main():
    """Command-line entry point.

    Accepted argument forms (all optional):
        (none)               read XML from stdin, min_count 10000
        MIN_COUNT [FILE]     when the first argument consists of digits only
        FILE [MIN_COUNT]     otherwise
        -h | --help          print the module docstring and exit with 0

    Prints one "<surname> <count>" line per result. Exits with status 1 on an
    invalid MIN_COUNT, on KeyboardInterrupt, or on any other exception raised
    while processing.
    """
    args = sys.argv[1:]
    if args and args[0] in ["-h", "--help"]:
        print(__doc__)
        sys.exit(0)
    min_count = 10000
    filename = None
    # Handle command line arguments
    try:
        if args and args[0].isdigit():
            min_count = int(args[0])
            filename = args[1] if len(args) > 1 else None
        elif args:
            filename = args[0]
            if len(args) > 1:
                min_count = int(args[1])
    except ValueError as e:
        # Report a bad MIN_COUNT like every other error instead of a traceback.
        print(f"Error: invalid min_count: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)
        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/surnames_quick_validation.sh
+++ b/surnames_quick_validation.sh
@@ -0,0 +1,38 @@
 #!/bin/bash
 # quick_validation.sh - Simplified version for rapid testing
 #
 # Runs ./surnames and surnames.py on dblp.xml.gz in parallel, compares their
 # output files, then runs ./surnames under valgrind. The temporary result
 # files are removed at the end.
 echo "=== Quick Surname Validation ==="
 # Without the input both pipelines write empty files that compare equal,
 # which would be reported as a successful validation.
 if [ ! -r dblp.xml.gz ]; then
    echo "Error: dblp.xml.gz not found or not readable" >&2
    exit 1
 fi
 # Quick output comparison
 echo "Generating results..."
 gunzip -c dblp.xml.gz | ./surnames > c_quick.txt &
 C_PID=$!
 gunzip -c dblp.xml.gz | python3 surnames.py > python_quick.txt &
 PYTHON_PID=$!
 wait $C_PID $PYTHON_PID
 echo "C results: $(wc -l < c_quick.txt) surnames"
 echo "Python results: $(wc -l < python_quick.txt) surnames"
 echo "Wang comparison:"
 echo "C: $(grep "^Wang " c_quick.txt)"
 echo "Python: $(grep "^Wang " python_quick.txt)"
 if diff -q c_quick.txt python_quick.txt > /dev/null; then
    echo "✓ Results identical!"
 else
    echo "⚠ Results differ"
 fi
 # Quick memory check
 echo "Memory check:"
 # --error-exitcode makes valgrind return non-zero when it reports errors;
 # without it the exit status is just the program's own exit code. The report
 # is written to a log file so that it can actually be inspected afterwards.
 VALGRIND_LOG="valgrind_quick.log"
 if gunzip -c dblp.xml.gz | valgrind --leak-check=yes --error-exitcode=1 \
        --log-file="$VALGRIND_LOG" ./surnames > /dev/null 2>&1; then
    echo "✓ No major memory issues"
    rm -f "$VALGRIND_LOG"
 else
    echo "⚠ Check valgrind output in $VALGRIND_LOG"
 fi
 rm -f c_quick.txt python_quick.txt
--- a/surnames_validation.sh
+++ b/surnames_validation.sh
@@ -0,0 +1,45 @@
 #!/bin/bash
 # simple_surnames_validation.sh - Robust and simple
 #
 # Usage: surnames_validation.sh [DBLP_FILE]   (default: dblp.xml.gz)
 #
 # Runs ./surnames and surnames.py one after the other on the gzip-compressed
 # input, compares their outputs and finally runs ./surnames under valgrind on
 # a tiny input.
 DBLP_FILE="${1:-dblp.xml.gz}"
 echo "=== Simple Surname Validation ==="
 echo "Input: $DBLP_FILE"
 # Without the input both programs produce empty output that compares equal.
 if [ ! -r "$DBLP_FILE" ]; then
    echo "Error: $DBLP_FILE not found or not readable" >&2
    exit 1
 fi
 # mktemp creates a fresh, unpredictable directory; the trap removes it on
 # every exit path, including interruption.
 TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/surnames_val.XXXXXX") || exit 1
 trap 'rm -rf "$TEMP_DIR"' EXIT
 # Sequential execution to avoid pipe issues
 echo "Running C program..."
 gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
 c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
 c_top=$(head -1 "$TEMP_DIR/c_results.txt")
 echo "Running Python validation..."
 gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
 python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
 python_top=$(head -1 "$TEMP_DIR/python_results.txt")
 echo "Results:"
 echo "  C program: $c_count surnames"
 echo "  Python script: $python_count surnames"
 echo "  C top result: $c_top"
 echo "  Python top result: $python_top"
 if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
    echo "✓ Results are identical!"
 else
    echo "⚠ Results differ - checking details..."
    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
 fi
 echo "Memory check..."
 # --error-exitcode makes valgrind return non-zero when it reports errors;
 # without it the exit status is just the program's own exit code.
 if echo "test" | valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2>&1; then
    echo "✓ No memory leaks detected"
 else
    echo "⚠ valgrind reported errors (or ./surnames failed) - rerun valgrind manually for details"
 fi
 # Cleanup of $TEMP_DIR happens in the EXIT trap
 echo "Validation complete!"
--- a/validate_extraction.sh
+++ b/validate_extraction.sh
@@ -1,41 +0,0 @@
 #!/bin/bash
 # Cross-checks the C program against a grep/sed/awk pipeline.
 # Reads dblp.xml from the current directory, writes bash_results.txt and
 # c_results.txt, and prints both result counts, the first five lines of each
 # file and the "Wang" line of each file.
 echo "Running full validation with working pipeline..."
 # Generate complete bash results using the proven pipeline
 # Stages: keep lines containing an author/editor tag, strip everything around
 # the element text, drop a trailing 4-digit field, take the last field as the
 # surname and remove "&...;" sequences, count identical surnames, keep those
 # with a count of at least 10000, and sort numerically descending by count.
 echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
 {
    grep -E "<(author|editor)" dblp.xml | \
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
    awk '{
        if ($NF ~ /^[0-9]{4}$/) { NF-- }
        if (NF > 0) { 
            surname = $NF
            gsub(/&[^;]*;/, "", surname)
            if (length(surname) > 0) print surname 
        }
    }' | \
    sort | uniq -c | \
    awk '$1 >= 10000 {print $2, $1}' | \
    sort -k2 -nr
 } > bash_results.txt
 echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"
 # Get your C program results
 # NOTE(review): this runs ./main, while the other scripts in this change run ./surnames - confirm the binary name.
 ./main > c_results.txt
 echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"
 # Quick comparison of top entries
 echo "Top 5 comparison:"
 echo "=== C Program ==="
 head -5 c_results.txt
 echo "=== Bash Script ==="
 head -5 bash_results.txt
 # Check Wang specifically
 echo "Wang comparison:"
 echo "C: $(grep "^Wang " c_results.txt)"
 echo "Bash: $(grep "^Wang " bash_results.txt)"
Author	SHA1	Message	Date
Matthias Puchstein	8e9cd18fa6	some small fixes	2025-06-13 05:51:53 +02:00
Matthias Puchstein	aa6cde55f4	update gitignore	2025-06-13 05:05:52 +02:00
Matthias Puchstein	d4d3dca574	added new bash validation scripts	2025-06-13 05:05:33 +02:00
Matthias Puchstein	b669c7135a	added py script for validation	2025-06-13 04:58:34 +02:00
Matthias Puchstein	d35c253afa	set min count again to 10000	2025-06-13 04:57:30 +02:00
Matthias Puchstein	8f65b8c142	deleted bash script for now	2025-06-13 04:57:23 +02:00