From 8e9cd18fa6698ee05c80b614be891ec3742ccaaf Mon Sep 17 00:00:00 2001
From: Matthias Puchstein <matthias@puchstein.bayern>
Date: Fri, 13 Jun 2025 05:51:53 +0200
Subject: [PATCH] Refactor newPerson, rewrite surnames.py, simplify validation

---
 surnames.c             |   9 +-
 surnames.py            | 176 +++++++++++++++++++++++++++++---------
 surnames_validation.sh | 187 ++++++-----------------------------------
 3 files changed, 170 insertions(+), 202 deletions(-)

diff --git a/surnames.c b/surnames.c
index dc21ee0..e5e3cc4 100644
--- a/surnames.c
+++ b/surnames.c
@@ -9,7 +9,7 @@
 #define MIN_COUNT 10000
 
 /*
- * Usage: pipe dblp.xml to the programm or have it in the same folder as the program
+ * Usage: pipe dblp.xml to the program or have it in the same folder as the program
  */
 
 void string_ncopy(char *dest, const char *src, size_t max_len) {
@@ -27,10 +27,12 @@ typedef struct person {
     int count;
 } person;
 
-void newPerson(person *p, const char *name) {
+person* newPerson(const char *name) {
+    person *p = (person *) malloc(sizeof(person));
     string_ncopy(p->name, name, BUFFER_LENGTH);
     p->count = 1;
     p->next = NULL;
+    return p;
 }
 
 void sorted_name_insert(person **head, char *name) {
@@ -43,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
         p = p->next;
     }
 
-    person *node = (person *) malloc(sizeof(person));
-    newPerson(node, name);
+    person *node = newPerson(name);
 
     if (*head == NULL || strcmp((*head)->name, name) > 0) {
         node->next = *head;
diff --git a/surnames.py b/surnames.py
index e5b48c1..9b4f838 100644
--- a/surnames.py
+++ b/surnames.py
@@ -1,64 +1,162 @@
 #!/usr/bin/env python3
 """
-Fast DBLP Surname Extractor - Optimized for 4GB+ files
+DBLP Surname Extractor
+Replicates C program logic for surname frequency analysis
+Usage: gunzip -c dblp.xml.gz | python surnames.py
+       python surnames.py dblp.xml
 """
 
-import sys
 import re
-from collections import defaultdict
+import sys
+import xml.etree.ElementTree as ET
+from collections import Counter
 
 
-def fast_extract_surnames(input_stream, min_count=10000):
-    """Memory-efficient surname extraction"""
-    surname_counts = defaultdict(int)
+def extract_surname(name_text):
+    """Extract surname using same logic as C program"""
+    if not name_text:
+        return None
 
-    # Compile regex for performance
-    tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
-    year_pattern = re.compile(r"\s+\d{4}$")
-    entity_pattern = re.compile(r"&[^;]*;")
+    # Split into words
+    words = name_text.strip().split()
+    if not words:
+        return None
+
+    # Remove 4-digit year if present at end
+    if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
+        words = words[:-1]
+
+    if not words:
+        return None
+
+    # Return last word as surname
+    surname = words[-1]
+
+    # Clean up XML entities (basic cleanup)
+    surname = re.sub(r"&[^;]*;", "", surname)
+
+    return surname if surname else None
+
+
+def process_xml_stream(input_stream, min_count=10000):
+    """Process XML from stream (for piping)"""
+    surname_counter = Counter()
+
+    # Read and process line by line for memory efficiency
+    current_element = ""
+    in_author_or_editor = False
+    tag_name = ""
 
     for line in input_stream:
-        for match in tag_pattern.finditer(line):
-            content = match.group(2).strip()
+        line = line.strip()
 
-            # Remove year suffix
-            content = year_pattern.sub("", content)
+        # Check for author or editor tags
+        author_match = re.search(
+            r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
+        )
+        if author_match:
+            tag_name = author_match.group(1)
+            content = author_match.group(2)
+            surname = extract_surname(content)
+            if surname:
+                surname_counter[surname] += 1
+        else:
+            # Handle multi-line tags
+            if re.search(r"<(author|editor)", line):
+                in_author_or_editor = True
+                tag_match = re.search(r"<(author|editor)", line)
+                tag_name = tag_match.group(1)
+                current_element = line
+            elif in_author_or_editor:
+                current_element += " " + line
+                if f"</{tag_name}>" in line:
+                    # Extract content between tags
+                    content_match = re.search(
+                        rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
+                    )
+                    if content_match:
+                        content = content_match.group(1)
+                        surname = extract_surname(content)
+                        if surname:
+                            surname_counter[surname] += 1
+                    in_author_or_editor = False
+                    current_element = ""
 
-            # Get surname (last word)
-            words = content.split()
-            if words:
-                surname = words[-1]
-                # Clean entities
-                surname = entity_pattern.sub("", surname)
-                if surname:
-                    surname_counts[surname] += 1
-
-    # Filter and sort results
-    results = [
+    # Return surnames above threshold, sorted by count
+    return [
         (surname, count)
-        for surname, count in surname_counts.items()
+        for surname, count in surname_counter.most_common()
         if count >= min_count
     ]
-    results.sort(key=lambda x: x[1], reverse=True)
 
-    return results
+
+def process_xml_file(filename, min_count=10000):
+    """Process XML file using ElementTree (more robust)"""
+    surname_counter = Counter()
+
+    try:
+        # Parse XML incrementally for memory efficiency
+        context = ET.iterparse(filename, events=("start", "end"))
+        context = iter(context)
+        event, root = next(context)
+
+        for event, elem in context:
+            if event == "end" and elem.tag in ["author", "editor"]:
+                if elem.text:
+                    surname = extract_surname(elem.text)
+                    if surname:
+                        surname_counter[surname] += 1
+                elem.clear()  # Free memory
+
+    except ET.ParseError:
+        # Fallback to line-by-line processing
+        print(
+            "XML parsing failed, falling back to regex processing...", file=sys.stderr
+        )
+        with open(filename, "r", encoding="utf-8") as f:
+            return process_xml_stream(f, min_count)
+
+    return [
+        (surname, count)
+        for surname, count in surname_counter.most_common()
+        if count >= min_count
+    ]
 
 
 def main():
-    min_count = (
-        int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000
-    )
+    min_count = 10000
 
-    if len(sys.argv) > 1 and not sys.argv[1].isdigit():
-        # Read from file
-        with open(sys.argv[1], "r", encoding="utf-8") as f:
-            results = fast_extract_surnames(f, min_count)
+    # Handle command line arguments
+    if len(sys.argv) > 1:
+        if sys.argv[1] in ["-h", "--help"]:
+            print(__doc__)
+            sys.exit(0)
+        elif sys.argv[1].isdigit():
+            min_count = int(sys.argv[1])
+            filename = sys.argv[2] if len(sys.argv) > 2 else None
+        else:
+            filename = sys.argv[1]
+            min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
     else:
-        # Read from stdin
-        results = fast_extract_surnames(sys.stdin, min_count)
+        filename = None
 
-    for surname, count in results:
-        print(f"{surname} {count}")
+    try:
+        if filename:
+            # Read from file
+            results = process_xml_file(filename, min_count)
+        else:
+            # Read from stdin (piped input)
+            results = process_xml_stream(sys.stdin, min_count)
+
+        # Output results in same format as C program
+        for surname, count in results:
+            print(f"{surname} {count}")
+
+    except KeyboardInterrupt:
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/surnames_validation.sh b/surnames_validation.sh
index 2d7ac00..26ac5c8 100644
--- a/surnames_validation.sh
+++ b/surnames_validation.sh
@@ -1,176 +1,45 @@
 #!/bin/bash
-# surnames_validation.sh
-# Comprehensive validation script for DBLP surname extraction
-
-set -e  # Exit on any error
+# surnames_validation.sh - Robust and simple
 
 DBLP_FILE="${1:-dblp.xml.gz}"
-MIN_COUNT="${2:-10000}"
-TEMP_DIR="/tmp/surnames_validation_$$"
+TEMP_DIR="/tmp/surnames_val_$$"
 
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}"
+echo "=== Simple Surname Validation ==="
 echo "Input: $DBLP_FILE"
-echo "Minimum count: $MIN_COUNT"
-echo "Temp directory: $TEMP_DIR"
 
-# Create temp directory
 mkdir -p "$TEMP_DIR"
 
-# Function to cleanup
-cleanup() {
-    echo -e "\n${YELLOW}Cleaning up...${NC}"
-    rm -rf "$TEMP_DIR"
-}
-trap cleanup EXIT
+# Sequential execution to avoid pipe issues
+echo "Running C program..."
+gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
+c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
+c_wang=$(head -1 "$TEMP_DIR/c_results.txt")
 
-# Check if files exist
-if [[ ! -f "surnames" ]]; then
-    echo -e "${RED}Error: C program 'surnames' not found!${NC}"
-    exit 1
-fi
+echo "Running Python validation..."
+gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
+python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
+python_wang=$(head -1 "$TEMP_DIR/python_results.txt")
 
-if [[ ! -f "surnames.py" ]]; then
-    echo -e "${RED}Error: Python script 'surnames.py' not found!${NC}"
-    exit 1
-fi
+echo "Results:"
+echo "  C program: $c_count surnames"
+echo "  Python script: $python_count surnames"
+echo "  C top result: $c_wang"
+echo "  Python top result: $python_wang"
 
-if [[ ! -f "$DBLP_FILE" ]]; then
-    echo -e "${RED}Error: DBLP file '$DBLP_FILE' not found!${NC}"
-    exit 1
-fi
-
-echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}"
-echo "Running valgrind on C program..."
-if command -v valgrind &> /dev/null; then
-    gunzip -c "$DBLP_FILE" | valgrind --leak-check=full --show-leak-kinds=all \
-        --track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \
-        ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1
-    
-    echo "Valgrind results:"
-    if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then
-        echo -e "${GREEN}✓ No memory leaks detected${NC}"
-    else
-        echo -e "${RED}⚠ Potential memory issues detected${NC}"
-        grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" | head -5
-    fi
-    echo "Full valgrind log: $TEMP_DIR/valgrind.log"
+if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
+    echo "✓ Results are identical!"
 else
-    echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}"
+    echo "⚠ Results differ - checking details..."
+    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
 fi
 
-echo -e "\n${BLUE}=== Performance Profiling ===${NC}"
-
-# C program performance
-echo "Profiling C program..."
-if command -v perf &> /dev/null; then
-    echo "Using perf for detailed profiling..."
-    gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log"
-    echo "Perf results:"
-    grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -4
-else
-    echo "Using time for basic profiling..."
-    gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log"
-    echo "Time results:"
-    grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/c_time.log"
+echo "Memory check..."
+echo "test" | valgrind --leak-check=yes ./surnames > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+    echo "✓ No memory leaks detected"
 fi
 
-# Python program performance
-echo -e "\nProfiling Python program..."
-gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log"
-echo "Python time results:"
-grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/python_time.log"
-
-echo -e "\n${BLUE}=== Output Validation ===${NC}"
-
-# Compare line counts
-c_lines=$(wc -l < "$TEMP_DIR/c_results.txt")
-python_lines=$(wc -l < "$TEMP_DIR/python_results.txt")
-
-echo "Result counts:"
-echo "  C program: $c_lines surnames"
-echo "  Python script: $python_lines surnames"
-
-if [[ $c_lines -eq $python_lines ]]; then
-    echo -e "${GREEN}✓ Line counts match${NC}"
-else
-    echo -e "${YELLOW}⚠ Line counts differ${NC}"
-fi
-
-# Compare top 10 results
-echo -e "\nTop 10 comparison:"
-echo -e "${BLUE}C Program:${NC}"
-head -10 "$TEMP_DIR/c_results.txt"
-echo -e "${BLUE}Python Script:${NC}"
-head -10 "$TEMP_DIR/python_results.txt"
-
-# Detailed difference analysis
-echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}"
-if command -v diff &> /dev/null; then
-    echo "Running diff analysis..."
-    if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
-        echo -e "${GREEN}✓ Results are identical!${NC}"
-    else
-        echo -e "${YELLOW}⚠ Found differences:${NC}"
-        diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -20
-        
-        # Count differences
-        diff_count=$(diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | wc -l)
-        echo "Total difference lines: $diff_count"
-    fi
-else
-    echo "diff not available, using comm..."
-    comm -3 <(sort "$TEMP_DIR/c_results.txt") <(sort "$TEMP_DIR/python_results.txt") | head -10
-fi
-
-# Specific validation checks
-echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}"
-
-# Check Wang count (most frequent)
-c_wang=$(grep "^Wang " "$TEMP_DIR/c_results.txt" | awk '{print $2}')
-python_wang=$(grep "^Wang " "$TEMP_DIR/python_results.txt" | awk '{print $2}')
-
-echo "Wang frequency check:"
-echo "  C: $c_wang"
-echo "  Python: $python_wang"
-if [[ "$c_wang" == "$python_wang" ]]; then
-    echo -e "${GREEN}✓ Wang counts match${NC}"
-else
-    echo -e "${RED}✗ Wang counts differ${NC}"
-fi
-
-# Check for duplicates in C results
-echo -e "\nDuplicate check (C program):"
-duplicates=$(awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | wc -l)
-if [[ $duplicates -eq 0 ]]; then
-    echo -e "${GREEN}✓ No duplicates found${NC}"
-else
-    echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}"
-    awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | head -5
-fi
-
-# Performance comparison
-echo -e "\n${BLUE}=== Performance Summary ===${NC}"
-c_time=$(grep "Elapsed" "$TEMP_DIR/c_time.log" | awk '{print $8}' | head -1)
-python_time=$(grep "Elapsed" "$TEMP_DIR/python_time.log" | awk '{print $8}' | head -1)
-
-echo "Execution times:"
-echo "  C program: $c_time"
-echo "  Python script: $python_time"
-
-# Save results for later analysis
-echo -e "\n${BLUE}=== Results Saved ===${NC}"
-echo "Results saved in: $TEMP_DIR"
-echo "  C results: $TEMP_DIR/c_results.txt"
-echo "  Python results: $TEMP_DIR/python_results.txt"
-echo "  Valgrind log: $TEMP_DIR/valgrind.log"
-echo "  Performance logs: $TEMP_DIR/*_time.log"
-
-echo -e "\n${GREEN}Validation complete!${NC}"
+# Cleanup
+rm -rf "$TEMP_DIR"
+echo "Validation complete!"