some small fixes

2025-06-13 05:51:53 +02:00
parent aa6cde55f4
commit 8e9cd18fa6
3 changed files with 170 additions and 202 deletions
--- a/surnames.c
+++ b/surnames.c
@@ -9,7 +9,7 @@
 #define MIN_COUNT 10000
 /*
- * Usage: pipe dblp.xml to the programm or have it in the same folder as the program
+ * Usage: pipe dblp.xml to the program or have it in the same folder as the program
 */
 void string_ncopy(char *dest, const char *src, size_t max_len) {
@@ -27,10 +27,12 @@ typedef struct person {
    int count;
 } person;
-void newPerson(person *p, const char *name) {
+person* newPerson(const char *name) {
    person *p = (person *) malloc(sizeof(person));
    string_ncopy(p->name, name, BUFFER_LENGTH);
    p->count = 1;
    p->next = NULL;
    return p;
 }
 void sorted_name_insert(person **head, char *name) {
@@ -43,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
        p = p->next;
    }
-    person *node = (person *) malloc(sizeof(person));
+    person *node = newPerson(name);
    newPerson(node, name);
    if (*head == NULL || strcmp((*head)->name, name) > 0) {
        node->next = *head;
--- a/surnames.py
+++ b/surnames.py
@@ -1,64 +1,162 @@
 #!/usr/bin/env python3
 """
-Fast DBLP Surname Extractor - Optimized for 4GB+ files
+DBLP Surname Extractor
 Replicates C program logic for surname frequency analysis
 Usage: gunzip -c dblp.xml.gz | python surnames.py
       python surnames.py dblp.xml
 """
 import sys
 import re
-from collections import defaultdict
+import sys
 import xml.etree.ElementTree as ET
 from collections import Counter
-def fast_extract_surnames(input_stream, min_count=10000):
+def extract_surname(name_text):
-    """Memory-efficient surname extraction"""
+    """Extract surname using same logic as C program"""
-    surname_counts = defaultdict(int)
+    if not name_text:
        return None
-    # Compile regex for performance
+    # Split into words
-    tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
+    words = name_text.strip().split()
-    year_pattern = re.compile(r"\s+\d{4}$")
+    if not words:
-    entity_pattern = re.compile(r"&[^;]*;")
+        return None
    # Remove 4-digit year if present at end
    if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
        words = words[:-1]
    if not words:
        return None
    # Return last word as surname
    surname = words[-1]
    # Clean up XML entities (basic cleanup)
    surname = re.sub(r"&[^;]*;", "", surname)
    return surname if surname else None
 def process_xml_stream(input_stream, min_count=10000):
    """Process XML from stream (for piping)"""
    surname_counter = Counter()
    # Read and process line by line for memory efficiency
    current_element = ""
    in_author_or_editor = False
    tag_name = ""
    for line in input_stream:
-        for match in tag_pattern.finditer(line):
+        line = line.strip()
            content = match.group(2).strip()
-            # Remove year suffix
+        # Check for author or editor tags
-            content = year_pattern.sub("", content)
+        author_match = re.search(
            r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
        )
        if author_match:
            tag_name = author_match.group(1)
            content = author_match.group(2)
            surname = extract_surname(content)
            if surname:
                surname_counter[surname] += 1
        else:
            # Handle multi-line tags
            if re.search(r"<(author|editor)", line):
                in_author_or_editor = True
                tag_match = re.search(r"<(author|editor)", line)
                tag_name = tag_match.group(1)
                current_element = line
            elif in_author_or_editor:
                current_element += " " + line
                if f"</{tag_name}>" in line:
                    # Extract content between tags
                    content_match = re.search(
                        rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
                    )
                    if content_match:
                        content = content_match.group(1)
                        surname = extract_surname(content)
                        if surname:
                            surname_counter[surname] += 1
                    in_author_or_editor = False
                    current_element = ""
-            # Get surname (last word)
+    # Return surnames above threshold, sorted by count
-            words = content.split()
+    return [
            if words:
                surname = words[-1]
                # Clean entities
                surname = entity_pattern.sub("", surname)
                if surname:
                    surname_counts[surname] += 1
    # Filter and sort results
    results = [
        (surname, count)
-        for surname, count in surname_counts.items()
+        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
    results.sort(key=lambda x: x[1], reverse=True)
-    return results
+
 def process_xml_file(filename, min_count=10000):
    """Count author/editor surnames in an XML file using ElementTree.

    Args:
        filename: Path handed to ``ET.iterparse`` (and to ``open`` in the
            fallback path).
        min_count: Minimum number of occurrences a surname needs in order
            to be included in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        in ``Counter.most_common()`` order. If ElementTree raises
        ``ET.ParseError``, the result of ``process_xml_stream`` on the
        reopened file is returned instead.
    """
    surname_counter = Counter()
    try:
        # Parse XML incrementally for memory efficiency
        context = ET.iterparse(filename, events=("start", "end"))
        context = iter(context)
        # Consume the first event; `root` is not referenced again in this
        # function.
        event, root = next(context)
        for event, elem in context:
            # Only completed <author>/<editor> elements are counted.
            if event == "end" and elem.tag in ["author", "editor"]:
                if elem.text:
                    surname = extract_surname(elem.text)
                    if surname:
                        surname_counter[surname] += 1
                # NOTE(review): only author/editor elements are cleared here;
                # other elements and `root` are never cleared, so memory may
                # still grow with input size -- confirm on a large file.
                elem.clear()  # Free memory
    except ET.ParseError:
        # Fallback to line-by-line processing
        # Counts gathered in surname_counter before the error are not passed
        # on; the file is reopened and handed to process_xml_stream from the
        # start.
        print(
            "XML parsing failed, falling back to regex processing...", file=sys.stderr
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)
    # Keep only surnames that reach the threshold, preserving most_common()
    # ordering.
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
 def main():
-    min_count = (
+    min_count = 10000
        int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000
    )
-    if len(sys.argv) > 1 and not sys.argv[1].isdigit():
+    # Handle command line arguments
-        # Read from file
+    if len(sys.argv) > 1:
-        with open(sys.argv[1], "r", encoding="utf-8") as f:
+        if sys.argv[1] in ["-h", "--help"]:
-            results = fast_extract_surnames(f, min_count)
+            print(__doc__)
            sys.exit(0)
        elif sys.argv[1].isdigit():
            min_count = int(sys.argv[1])
            filename = sys.argv[2] if len(sys.argv) > 2 else None
        else:
            filename = sys.argv[1]
            min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
    else:
-        # Read from stdin
+        filename = None
        results = fast_extract_surnames(sys.stdin, min_count)
-    for surname, count in results:
+    try:
-        print(f"{surname} {count}")
+        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)
        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
 if __name__ == "__main__":
--- a/surnames_validation.sh
+++ b/surnames_validation.sh
@@ -1,176 +1,45 @@
 #!/bin/bash
-# surnames_validation.sh
+# surnames_validation.sh - Robust and simple
 # Comprehensive validation script for DBLP surname extraction
 set -e  # Exit on any error
 DBLP_FILE="${1:-dblp.xml.gz}"
-MIN_COUNT="${2:-10000}"
+TEMP_DIR="/tmp/surnames_val_$$"
 TEMP_DIR="/tmp/surnames_validation_$$"
-# Colors for output
+echo "=== Simple Surname Validation ==="
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}"
 echo "Input: $DBLP_FILE"
 echo "Minimum count: $MIN_COUNT"
 echo "Temp directory: $TEMP_DIR"
 # Create temp directory
 mkdir -p "$TEMP_DIR"
-# Function to cleanup
+# Sequential execution to avoid pipe issues
-cleanup() {
+echo "Running C program..."
-    echo -e "\n${YELLOW}Cleaning up...${NC}"
+gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
-    rm -rf "$TEMP_DIR"
+c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
-}
+c_wang=$(head -1 "$TEMP_DIR/c_results.txt")
 trap cleanup EXIT
-# Check if files exist
+echo "Running Python validation..."
-if [[ ! -f "surnames" ]]; then
+gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
-    echo -e "${RED}Error: C program 'surnames' not found!${NC}"
+python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
-    exit 1
+python_wang=$(head -1 "$TEMP_DIR/python_results.txt")
 fi
-if [[ ! -f "surnames.py" ]]; then
+echo "Results:"
-    echo -e "${RED}Error: Python script 'surnames.py' not found!${NC}"
+echo "  C program: $c_count surnames"
-    exit 1
+echo "  Python script: $python_count surnames"
-fi
+echo "  C top result: $c_wang"
 echo "  Python top result: $python_wang"
-if [[ ! -f "$DBLP_FILE" ]]; then
+if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
-    echo -e "${RED}Error: DBLP file '$DBLP_FILE' not found!${NC}"
+    echo "✓ Results are identical!"
    exit 1
 fi
 echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}"
 echo "Running valgrind on C program..."
 if command -v valgrind &> /dev/null; then
    gunzip -c "$DBLP_FILE" | valgrind --leak-check=full --show-leak-kinds=all \
        --track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \
        ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1
    echo "Valgrind results:"
    if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then
        echo -e "${GREEN}✓ No memory leaks detected${NC}"
    else
        echo -e "${RED}⚠ Potential memory issues detected${NC}"
        grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" | head -5
    fi
    echo "Full valgrind log: $TEMP_DIR/valgrind.log"
 else
-    echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}"
+    echo "⚠ Results differ - checking details..."
    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
 fi
-echo -e "\n${BLUE}=== Performance Profiling ===${NC}"
+echo "Memory check..."
-
+echo "test" | valgrind --leak-check=yes ./surnames > /dev/null 2>&1
-# C program performance
+if [ $? -eq 0 ]; then
-echo "Profiling C program..."
+    echo "✓ No memory leaks detected"
 if command -v perf &> /dev/null; then
    echo "Using perf for detailed profiling..."
    gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log"
    echo "Perf results:"
    grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -4
 else
    echo "Using time for basic profiling..."
    gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log"
    echo "Time results:"
    grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/c_time.log"
 fi
-# Python program performance
+# Cleanup
-echo -e "\nProfiling Python program..."
+rm -rf "$TEMP_DIR"
-gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log"
+echo "Validation complete!"
 echo "Python time results:"
 grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/python_time.log"
 echo -e "\n${BLUE}=== Output Validation ===${NC}"
 # Compare line counts
 c_lines=$(wc -l < "$TEMP_DIR/c_results.txt")
 python_lines=$(wc -l < "$TEMP_DIR/python_results.txt")
 echo "Result counts:"
 echo "  C program: $c_lines surnames"
 echo "  Python script: $python_lines surnames"
 if [[ $c_lines -eq $python_lines ]]; then
    echo -e "${GREEN}✓ Line counts match${NC}"
 else
    echo -e "${YELLOW}⚠ Line counts differ${NC}"
 fi
 # Compare top 10 results
 echo -e "\nTop 10 comparison:"
 echo -e "${BLUE}C Program:${NC}"
 head -10 "$TEMP_DIR/c_results.txt"
 echo -e "${BLUE}Python Script:${NC}"
 head -10 "$TEMP_DIR/python_results.txt"
 # Detailed difference analysis
 echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}"
 if command -v diff &> /dev/null; then
    echo "Running diff analysis..."
    if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
        echo -e "${GREEN}✓ Results are identical!${NC}"
    else
        echo -e "${YELLOW}⚠ Found differences:${NC}"
        diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -20
        # Count differences
        diff_count=$(diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | wc -l)
        echo "Total difference lines: $diff_count"
    fi
 else
    echo "diff not available, using comm..."
    comm -3 <(sort "$TEMP_DIR/c_results.txt") <(sort "$TEMP_DIR/python_results.txt") | head -10
 fi
 # Specific validation checks
 echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}"
 # Check Wang count (most frequent)
 c_wang=$(grep "^Wang " "$TEMP_DIR/c_results.txt" | awk '{print $2}')
 python_wang=$(grep "^Wang " "$TEMP_DIR/python_results.txt" | awk '{print $2}')
 echo "Wang frequency check:"
 echo "  C: $c_wang"
 echo "  Python: $python_wang"
 if [[ "$c_wang" == "$python_wang" ]]; then
    echo -e "${GREEN}✓ Wang counts match${NC}"
 else
    echo -e "${RED}✗ Wang counts differ${NC}"
 fi
 # Check for duplicates in C results
 echo -e "\nDuplicate check (C program):"
 duplicates=$(awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | wc -l)
 if [[ $duplicates -eq 0 ]]; then
    echo -e "${GREEN}✓ No duplicates found${NC}"
 else
    echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}"
    awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | head -5
 fi
 # Performance comparison
 echo -e "\n${BLUE}=== Performance Summary ===${NC}"
 c_time=$(grep "Elapsed" "$TEMP_DIR/c_time.log" | awk '{print $8}' | head -1)
 python_time=$(grep "Elapsed" "$TEMP_DIR/python_time.log" | awk '{print $8}' | head -1)
 echo "Execution times:"
 echo "  C program: $c_time"
 echo "  Python script: $python_time"
 # Save results for later analysis
 echo -e "\n${BLUE}=== Results Saved ===${NC}"
 echo "Results saved in: $TEMP_DIR"
 echo "  C results: $TEMP_DIR/c_results.txt"
 echo "  Python results: $TEMP_DIR/python_results.txt"
 echo "  Valgrind log: $TEMP_DIR/valgrind.log"
 echo "  Performance logs: $TEMP_DIR/*_time.log"
 echo -e "\n${GREEN}Validation complete!${NC}"