some small fixes

update gitignore
added new bash validation scripts
2025-06-13 05:51:53 +02:00 · 2025-06-13 05:05:52 +02:00 · 2025-06-13 05:05:33 +02:00 · 2025-06-13 04:58:34 +02:00 · 2025-06-13 04:57:30 +02:00 · 2025-06-13 04:57:23 +02:00
6 changed files with 257 additions and 69 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,6 @@ dkms.conf

 /bash_results.txt
 /c_results.txt
+/surnames
+/c_quick.txt
+/python_quick.txt
--- a/surnames.c
+++ b/surnames.c
@@ -8,6 +8,10 @@
 #define HASH_BUCKETS 4000037
 #define MIN_COUNT 10000

+/*
+ * Usage: pipe dblp.xml to the program on stdin, or keep dblp.xml in the same folder as the program
+ */
+
 void string_ncopy(char *dest, const char *src, size_t max_len) {
    size_t i = 0;
    while (i < max_len - 1 && src[i]) {
@@ -23,10 +27,12 @@ typedef struct person {
    int count;
 } person;

-void newPerson(person *p, const char *name) {
+person* newPerson(const char *name) {
+    person *p = (person *) malloc(sizeof(person));
    string_ncopy(p->name, name, BUFFER_LENGTH);
    p->count = 1;
    p->next = NULL;
+    return p;
 }

 void sorted_name_insert(person **head, char *name) {
@@ -39,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
        p = p->next;
    }

-    person *node = (person *) malloc(sizeof(person));
-    newPerson(node, name);
+    person *node = newPerson(name);

    if (*head == NULL || strcmp((*head)->name, name) > 0) {
        node->next = *head;
@@ -170,30 +175,6 @@ void make_list(person **hashmap, person **list, const int min_count) {
    free(hashmap);
 }

-void verify_list(person *list, person *test) {
-    person *p = list;
-    int count = 0;
-    while (p != NULL) {
-        person *next = p->next;
-        if (strcmp(p->name, test->name) == 0) {
-            count++;
-        }
-        p = next;
-    }
-    if (count > 1) {
-        printf("ERROR: %s\n", test->name);
-    }
-}
-
-void check_list(person *list) {
-    person *p = list;
-    while (p != NULL) {
-        person *next = p->next;
-        verify_list(list, p);
-        p = next;
-    }
-}
-
 void clean_list(person *list) {
    person *p = list;
    while (p != NULL) {
@@ -246,7 +227,6 @@ int main(void) {
    free(buffer);
    person *list = NULL;
    make_list(hashmap, &list, MIN_COUNT);
-    check_list(list);
    display(list);
    clean_list(list);
    return 0;
--- a/surnames.py
+++ b/surnames.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+DBLP Surname Extractor
+Replicates C program logic for surname frequency analysis
+Usage: gunzip -c dblp.xml.gz | python surnames.py
+       python surnames.py dblp.xml
+"""
+
+import re
+import sys
+import xml.etree.ElementTree as ET
+from collections import Counter
+
+
+def extract_surname(name_text):
+    """Extract surname using same logic as C program"""
+    if not name_text:
+        return None
+
+    # Split into words
+    words = name_text.strip().split()
+    if not words:
+        return None
+
+    # Remove 4-digit year if present at end
+    if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
+        words = words[:-1]
+
+    if not words:
+        return None
+
+    # Return last word as surname
+    surname = words[-1]
+
+    # Clean up XML entities (basic cleanup)
+    surname = re.sub(r"&[^;]*;", "", surname)
+
+    return surname if surname else None
+
+
+def process_xml_stream(input_stream, min_count=10000):
+    """Process XML from stream (for piping)"""
+    surname_counter = Counter()
+
+    # Read and process line by line for memory efficiency
+    current_element = ""
+    in_author_or_editor = False
+    tag_name = ""
+
+    for line in input_stream:
+        line = line.strip()
+
+        # Check for author or editor tags
+        author_match = re.search(
+            r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
+        )
+        if author_match:
+            tag_name = author_match.group(1)
+            content = author_match.group(2)
+            surname = extract_surname(content)
+            if surname:
+                surname_counter[surname] += 1
+        else:
+            # Handle multi-line tags
+            if re.search(r"<(author|editor)", line):
+                in_author_or_editor = True
+                tag_match = re.search(r"<(author|editor)", line)
+                tag_name = tag_match.group(1)
+                current_element = line
+            elif in_author_or_editor:
+                current_element += " " + line
+                if f"</{tag_name}>" in line:
+                    # Extract content between tags
+                    content_match = re.search(
+                        rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
+                    )
+                    if content_match:
+                        content = content_match.group(1)
+                        surname = extract_surname(content)
+                        if surname:
+                            surname_counter[surname] += 1
+                    in_author_or_editor = False
+                    current_element = ""
+
+    # Return surnames above threshold, sorted by count
+    return [
+        (surname, count)
+        for surname, count in surname_counter.most_common()
+        if count >= min_count
+    ]
+
+
+def process_xml_file(filename, min_count=10000):
+    """Process XML file using ElementTree (more robust)"""
+    surname_counter = Counter()
+
+    try:
+        # Parse XML incrementally for memory efficiency
+        context = ET.iterparse(filename, events=("start", "end"))
+        context = iter(context)
+        event, root = next(context)
+
+        for event, elem in context:
+            if event == "end" and elem.tag in ["author", "editor"]:
+                if elem.text:
+                    surname = extract_surname(elem.text)
+                    if surname:
+                        surname_counter[surname] += 1
+                elem.clear()  # Free memory
+
+    except ET.ParseError:
+        # Fallback to line-by-line processing
+        print(
+            "XML parsing failed, falling back to regex processing...", file=sys.stderr
+        )
+        with open(filename, "r", encoding="utf-8") as f:
+            return process_xml_stream(f, min_count)
+
+    return [
+        (surname, count)
+        for surname, count in surname_counter.most_common()
+        if count >= min_count
+    ]
+
+
+def main():
+    min_count = 10000
+
+    # Handle command line arguments
+    if len(sys.argv) > 1:
+        if sys.argv[1] in ["-h", "--help"]:
+            print(__doc__)
+            sys.exit(0)
+        elif sys.argv[1].isdigit():
+            min_count = int(sys.argv[1])
+            filename = sys.argv[2] if len(sys.argv) > 2 else None
+        else:
+            filename = sys.argv[1]
+            min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
+    else:
+        filename = None
+
+    try:
+        if filename:
+            # Read from file
+            results = process_xml_file(filename, min_count)
+        else:
+            # Read from stdin (piped input)
+            results = process_xml_stream(sys.stdin, min_count)
+
+        # Output results in same format as C program
+        for surname, count in results:
+            print(f"{surname} {count}")
+
+    except KeyboardInterrupt:
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/surnames_quick_validation.sh
+++ b/surnames_quick_validation.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# surnames_quick_validation.sh - Simplified version for rapid testing
+
+echo "=== Quick Surname Validation ==="
+
+# Quick output comparison
+echo "Generating results..."
+gunzip -c dblp.xml.gz | ./surnames > c_quick.txt &
+C_PID=$!
+gunzip -c dblp.xml.gz | python3 surnames.py > python_quick.txt &
+PYTHON_PID=$!
+
+wait $C_PID $PYTHON_PID
+
+echo "C results: $(wc -l < c_quick.txt) surnames"
+echo "Python results: $(wc -l < python_quick.txt) surnames"
+
+echo "Wang comparison:"
+echo "C: $(grep "^Wang " c_quick.txt)"
+echo "Python: $(grep "^Wang " python_quick.txt)"
+
+if diff -q c_quick.txt python_quick.txt > /dev/null; then
+    echo "✓ Results identical!"
+else
+    echo "⚠ Results differ"
+fi
+
+# Quick memory check; --error-exitcode makes valgrind exit non-zero on reported errors (default: the program's own status)
+echo "Memory check:"
+gunzip -c dblp.xml.gz | valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓ No major memory issues"
+else
+    echo "⚠ Check valgrind output"
+fi
+
+rm -f c_quick.txt python_quick.txt
+
--- a/surnames_validation.sh
+++ b/surnames_validation.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# surnames_validation.sh - Robust and simple
+
+DBLP_FILE="${1:-dblp.xml.gz}"
+TEMP_DIR="/tmp/surnames_val_$$"
+
+echo "=== Simple Surname Validation ==="
+echo "Input: $DBLP_FILE"
+
+mkdir -p "$TEMP_DIR"
+
+# Sequential execution to avoid pipe issues
+echo "Running C program..."
+gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
+c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
+c_wang=$(head -1 "$TEMP_DIR/c_results.txt")
+
+echo "Running Python validation..."
+gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
+python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
+python_wang=$(head -1 "$TEMP_DIR/python_results.txt")
+
+echo "Results:"
+echo "  C program: $c_count surnames"
+echo "  Python script: $python_count surnames"
+echo "  C top result: $c_wang"
+echo "  Python top result: $python_wang"
+
+if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
+    echo "✓ Results are identical!"
+else
+    echo "⚠ Results differ - checking details..."
+    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
+fi
+
+echo "Memory check..."
+# --error-exitcode: without it valgrind returns the program's own exit status, so leaks would go unnoticed
+if echo "test" | valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2>&1; then
+    echo "✓ No memory leaks detected"
+fi
+
+# Cleanup
+rm -rf "$TEMP_DIR"
+echo "Validation complete!"
+
--- a/validate_extraction.sh
+++ b/validate_extraction.sh
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-echo "Running full validation with working pipeline..."
-
-# Generate complete bash results using the proven pipeline
-echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
-{
-    grep -E "<(author|editor)" dblp.xml | \
-    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
-    awk '{
-        if ($NF ~ /^[0-9]{4}$/) { NF-- }
-        if (NF > 0) { 
-            surname = $NF
-            gsub(/&[^;]*;/, "", surname)
-            if (length(surname) > 0) print surname 
-        }
-    }' | \
-    sort | uniq -c | \
-    awk '$1 >= 10000 {print $2, $1}' | \
-    sort -k2 -nr
-} > bash_results.txt
-
-echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"
-
-# Get your C program results
-./main > c_results.txt
-
-echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"
-
-# Quick comparison of top entries
-echo "Top 5 comparison:"
-echo "=== C Program ==="
-head -5 c_results.txt
-echo "=== Bash Script ==="
-head -5 bash_results.txt
-
-# Check Wang specifically
-echo "Wang comparison:"
-echo "C: $(grep "^Wang " c_results.txt)"
-echo "Bash: $(grep "^Wang " bash_results.txt)"
-
Author	SHA1	Message	Date
Matthias Puchstein	8e9cd18fa6	some small fixes	2025-06-13 05:51:53 +02:00
Matthias Puchstein	aa6cde55f4	update gitignore	2025-06-13 05:05:52 +02:00
Matthias Puchstein	d4d3dca574	added new bash validation scripts	2025-06-13 05:05:33 +02:00
Matthias Puchstein	b669c7135a	added py script for validation	2025-06-13 04:58:34 +02:00
Matthias Puchstein	d35c253afa	set min count again to 10000	2025-06-13 04:57:30 +02:00
Matthias Puchstein	8f65b8c142	deleted bash script for now	2025-06-13 04:57:23 +02:00