From 8e9cd18fa6698ee05c80b614be891ec3742ccaaf Mon Sep 17 00:00:00 2001 From: Matthias Puchstein Date: Fri, 13 Jun 2025 05:51:53 +0200 Subject: [PATCH] some small fixes --- surnames.c | 9 +- surnames.py | 176 +++++++++++++++++++++++++++++--------- surnames_validation.sh | 187 ++++++----------------------------------- 3 files changed, 170 insertions(+), 202 deletions(-) diff --git a/surnames.c b/surnames.c index dc21ee0..e5e3cc4 100644 --- a/surnames.c +++ b/surnames.c @@ -9,7 +9,7 @@ #define MIN_COUNT 10000 /* - * Usage: pipe dblp.xml to the programm or have it in the same folder as the program + * Usage: pipe dblp.xml to the program or have it in the same folder as the program */ void string_ncopy(char *dest, const char *src, size_t max_len) { @@ -27,10 +27,12 @@ typedef struct person { int count; } person; -void newPerson(person *p, const char *name) { +person* newPerson(const char *name) { + person *p = (person *) malloc(sizeof(person)); string_ncopy(p->name, name, BUFFER_LENGTH); p->count = 1; p->next = NULL; + return p; } void sorted_name_insert(person **head, char *name) { @@ -43,8 +45,7 @@ void sorted_name_insert(person **head, char *name) { p = p->next; } - person *node = (person *) malloc(sizeof(person)); - newPerson(node, name); + person *node = newPerson(name); if (*head == NULL || strcmp((*head)->name, name) > 0) { node->next = *head; diff --git a/surnames.py b/surnames.py index e5b48c1..9b4f838 100644 --- a/surnames.py +++ b/surnames.py @@ -1,64 +1,162 @@ #!/usr/bin/env python3 """ -Fast DBLP Surname Extractor - Optimized for 4GB+ files +DBLP Surname Extractor +Replicates C program logic for surname frequency analysis +Usage: gunzip -c dblp.xml.gz | python surnames.py + python surnames.py dblp.xml """ -import sys import re -from collections import defaultdict +import sys +import xml.etree.ElementTree as ET +from collections import Counter -def fast_extract_surnames(input_stream, min_count=10000): - """Memory-efficient surname extraction""" - surname_counts = defaultdict(int) +def extract_surname(name_text): + """Extract surname using same logic as C program""" + if not name_text: + return None - # Compile regex for performance - tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)") - year_pattern = re.compile(r"\s+\d{4}$") - entity_pattern = re.compile(r"&[^;]*;") + # Split into words + words = name_text.strip().split() + if not words: + return None + + # Remove 4-digit year if present at end + if len(words) > 1 and re.match(r"^\d{4}$", words[-1]): + words = words[:-1] + + if not words: + return None + + # Return last word as surname + surname = words[-1] + + # Clean up XML entities (basic cleanup) + surname = re.sub(r"&[^;]*;", "", surname) + + return surname if surname else None + + +def process_xml_stream(input_stream, min_count=10000): + """Process XML from stream (for piping)""" + surname_counter = Counter() + + # Read and process line by line for memory efficiency + current_element = "" + in_author_or_editor = False + tag_name = "" for line in input_stream: - for match in tag_pattern.finditer(line): - content = match.group(2).strip() + line = line.strip() - # Remove year suffix - content = year_pattern.sub("", content) + # Check for author or editor tags + author_match = re.search( + r"<(author|editor)[^>]*>([^<]+)", line + ) + if author_match: + tag_name = author_match.group(1) + content = author_match.group(2) + surname = extract_surname(content) + if surname: + surname_counter[surname] += 1 + else: + # Handle multi-line tags + if re.search(r"<(author|editor)", line): + in_author_or_editor = True + tag_match = re.search(r"<(author|editor)", line) + tag_name = tag_match.group(1) + current_element = line + elif in_author_or_editor: + current_element += " " + line + if f"" in line: + # Extract content between tags + content_match = re.search( + rf"<{tag_name}[^>]*>([^<]+)", current_element + ) + if content_match: + content = content_match.group(1) + surname = extract_surname(content) + if surname: + surname_counter[surname] += 1 + in_author_or_editor = False + current_element = "" - # Get surname (last word) - words = content.split() - if words: - surname = words[-1] - # Clean entities - surname = entity_pattern.sub("", surname) - if surname: - surname_counts[surname] += 1 - - # Filter and sort results - results = [ + # Return surnames above threshold, sorted by count + return [ (surname, count) - for surname, count in surname_counts.items() + for surname, count in surname_counter.most_common() if count >= min_count ] - results.sort(key=lambda x: x[1], reverse=True) - return results + +def process_xml_file(filename, min_count=10000): + """Process XML file using ElementTree (more robust)""" + surname_counter = Counter() + + try: + # Parse XML incrementally for memory efficiency + context = ET.iterparse(filename, events=("start", "end")) + context = iter(context) + event, root = next(context) + + for event, elem in context: + if event == "end" and elem.tag in ["author", "editor"]: + if elem.text: + surname = extract_surname(elem.text) + if surname: + surname_counter[surname] += 1 + elem.clear() # Free memory + + except ET.ParseError: + # Fallback to line-by-line processing + print( + "XML parsing failed, falling back to regex processing...", file=sys.stderr + ) + with open(filename, "r", encoding="utf-8") as f: + return process_xml_stream(f, min_count) + + return [ + (surname, count) + for surname, count in surname_counter.most_common() + if count >= min_count + ] def main(): - min_count = ( - int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000 - ) + min_count = 10000 - if len(sys.argv) > 1 and not sys.argv[1].isdigit(): - # Read from file - with open(sys.argv[1], "r", encoding="utf-8") as f: - results = fast_extract_surnames(f, min_count) + # Handle command line arguments + if len(sys.argv) > 1: + if sys.argv[1] in ["-h", "--help"]: + print(__doc__) + sys.exit(0) + elif sys.argv[1].isdigit(): + min_count = int(sys.argv[1]) + filename = sys.argv[2] if len(sys.argv) > 2 else None + else: + filename = sys.argv[1] + min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000 else: - # Read from stdin - results = fast_extract_surnames(sys.stdin, min_count) + filename = None - for surname, count in results: - print(f"{surname} {count}") + try: + if filename: + # Read from file + results = process_xml_file(filename, min_count) + else: + # Read from stdin (piped input) + results = process_xml_stream(sys.stdin, min_count) + + # Output results in same format as C program + for surname, count in results: + print(f"{surname} {count}") + + except KeyboardInterrupt: + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/surnames_validation.sh b/surnames_validation.sh index 2d7ac00..26ac5c8 100644 --- a/surnames_validation.sh +++ b/surnames_validation.sh @@ -1,176 +1,45 @@ #!/bin/bash -# surnames_validation.sh -# Comprehensive validation script for DBLP surname extraction - -set -e # Exit on any error +# simple_surnames_validation.sh - Robust and simple DBLP_FILE="${1:-dblp.xml.gz}" -MIN_COUNT="${2:-10000}" -TEMP_DIR="/tmp/surnames_validation_$$" +TEMP_DIR="/tmp/surnames_val_$$" -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}" +echo "=== Simple Surname Validation ===" echo "Input: $DBLP_FILE" -echo "Minimum count: $MIN_COUNT" -echo "Temp directory: $TEMP_DIR" -# Create temp directory mkdir -p "$TEMP_DIR" -# Function to cleanup -cleanup() { - echo -e "\n${YELLOW}Cleaning up...${NC}" - rm -rf "$TEMP_DIR" -} -trap cleanup EXIT +# Sequential execution to avoid pipe issues +echo "Running C program..." +gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt" +c_count=$(wc -l < "$TEMP_DIR/c_results.txt") +c_wang=$(head -1 "$TEMP_DIR/c_results.txt") -# Check if files exist -if [[ ! -f "surnames" ]]; then - echo -e "${RED}Error: C program 'surnames' not found!${NC}" - exit 1 -fi +echo "Running Python validation..." +gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt" +python_count=$(wc -l < "$TEMP_DIR/python_results.txt") +python_wang=$(head -1 "$TEMP_DIR/python_results.txt") -if [[ ! -f "surnames.py" ]]; then - echo -e "${RED}Error: Python script 'surnames.py' not found!${NC}" - exit 1 -fi +echo "Results:" +echo " C program: $c_count surnames" +echo " Python script: $python_count surnames" +echo " C top result: $c_wang" +echo " Python top result: $python_wang" -if [[ ! -f "$DBLP_FILE" ]]; then - echo -e "${RED}Error: DBLP file '$DBLP_FILE' not found!${NC}" - exit 1 -fi - -echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}" -echo "Running valgrind on C program..." -if command -v valgrind &> /dev/null; then - gunzip -c "$DBLP_FILE" | valgrind --leak-check=full --show-leak-kinds=all \ - --track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \ - ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1 - - echo "Valgrind results:" - if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then - echo -e "${GREEN}✓ No memory leaks detected${NC}" - else - echo -e "${RED}⚠ Potential memory issues detected${NC}" - grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" | head -5 - fi - echo "Full valgrind log: $TEMP_DIR/valgrind.log" +if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then + echo "✓ Results are identical!" else - echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}" + echo "⚠ Results differ - checking details..." + diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10 fi -echo -e "\n${BLUE}=== Performance Profiling ===${NC}" - -# C program performance -echo "Profiling C program..." -if command -v perf &> /dev/null; then - echo "Using perf for detailed profiling..." - gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log" - echo "Perf results:" - grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -4 -else - echo "Using time for basic profiling..." - gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log" - echo "Time results:" - grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/c_time.log" +echo "Memory check..." +echo "test" | valgrind --leak-check=yes ./surnames > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "✓ No memory leaks detected" fi -# Python program performance -echo -e "\nProfiling Python program..." -gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log" -echo "Python time results:" -grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/python_time.log" - -echo -e "\n${BLUE}=== Output Validation ===${NC}" - -# Compare line counts -c_lines=$(wc -l < "$TEMP_DIR/c_results.txt") -python_lines=$(wc -l < "$TEMP_DIR/python_results.txt") - -echo "Result counts:" -echo " C program: $c_lines surnames" -echo " Python script: $python_lines surnames" - -if [[ $c_lines -eq $python_lines ]]; then - echo -e "${GREEN}✓ Line counts match${NC}" -else - echo -e "${YELLOW}⚠ Line counts differ${NC}" -fi - -# Compare top 10 results -echo -e "\nTop 10 comparison:" -echo -e "${BLUE}C Program:${NC}" -head -10 "$TEMP_DIR/c_results.txt" -echo -e "${BLUE}Python Script:${NC}" -head -10 "$TEMP_DIR/python_results.txt" - -# Detailed difference analysis -echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}" -if command -v diff &> /dev/null; then - echo "Running diff analysis..." - if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then - echo -e "${GREEN}✓ Results are identical!${NC}" - else - echo -e "${YELLOW}⚠ Found differences:${NC}" - diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -20 - - # Count differences - diff_count=$(diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | wc -l) - echo "Total difference lines: $diff_count" - fi -else - echo "diff not available, using comm..." - comm -3 <(sort "$TEMP_DIR/c_results.txt") <(sort "$TEMP_DIR/python_results.txt") | head -10 -fi - -# Specific validation checks -echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}" - -# Check Wang count (most frequent) -c_wang=$(grep "^Wang " "$TEMP_DIR/c_results.txt" | awk '{print $2}') -python_wang=$(grep "^Wang " "$TEMP_DIR/python_results.txt" | awk '{print $2}') - -echo "Wang frequency check:" -echo " C: $c_wang" -echo " Python: $python_wang" -if [[ "$c_wang" == "$python_wang" ]]; then - echo -e "${GREEN}✓ Wang counts match${NC}" -else - echo -e "${RED}✗ Wang counts differ${NC}" -fi - -# Check for duplicates in C results -echo -e "\nDuplicate check (C program):" -duplicates=$(awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | wc -l) -if [[ $duplicates -eq 0 ]]; then - echo -e "${GREEN}✓ No duplicates found${NC}" -else - echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}" - awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | head -5 -fi - -# Performance comparison -echo -e "\n${BLUE}=== Performance Summary ===${NC}" -c_time=$(grep "Elapsed" "$TEMP_DIR/c_time.log" | awk '{print $8}' | head -1) -python_time=$(grep "Elapsed" "$TEMP_DIR/python_time.log" | awk '{print $8}' | head -1) - -echo "Execution times:" -echo " C program: $c_time" -echo " Python script: $python_time" - -# Save results for later analysis -echo -e "\n${BLUE}=== Results Saved ===${NC}" -echo "Results saved in: $TEMP_DIR" -echo " C results: $TEMP_DIR/c_results.txt" -echo " Python results: $TEMP_DIR/python_results.txt" -echo " Valgrind log: $TEMP_DIR/valgrind.log" -echo " Performance logs: $TEMP_DIR/*_time.log" - -echo -e "\n${GREEN}Validation complete!${NC}" +# Cleanup +rm -rf "$TEMP_DIR" +echo "Validation complete!"