From d4d3dca57487088850fe4f07360454f50b2b4e11 Mon Sep 17 00:00:00 2001
From: Matthias Puchstein
Date: Fri, 13 Jun 2025 05:05:33 +0200
Subject: [PATCH] added new bash validation scripts

---
 surnames_quick_validation.sh |  50 +++++++++
 surnames_validation.sh       | 196 +++++++++++++++++++++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 surnames_quick_validation.sh
 create mode 100644 surnames_validation.sh

diff --git a/surnames_quick_validation.sh b/surnames_quick_validation.sh
new file mode 100644
index 0000000..c8b3147
--- /dev/null
+++ b/surnames_quick_validation.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# surnames_quick_validation.sh - Simplified version for rapid testing
+#
+# Runs the C and the Python surname extractor on dblp.xml.gz in parallel,
+# compares their output and finishes with a short valgrind run.
+
+echo "=== Quick Surname Validation ==="
+
+# Quick output comparison
+echo "Generating results..."
+gunzip -c dblp.xml.gz | ./surnames > c_quick.txt &
+C_PID=$!
+gunzip -c dblp.xml.gz | python3 surnames.py > python_quick.txt &
+PYTHON_PID=$!
+
+# Wait for each job on its own: 'wait pid1 pid2' only returns the status of
+# the last pid, so a failing C run would otherwise go unnoticed.
+wait "$C_PID" || echo "⚠ C program exited with an error"
+wait "$PYTHON_PID" || echo "⚠ Python script exited with an error"
+
+echo "C results: $(wc -l < c_quick.txt) surnames"
+echo "Python results: $(wc -l < python_quick.txt) surnames"
+
+echo "Wang comparison:"
+echo "C: $(grep "^Wang " c_quick.txt)"
+echo "Python: $(grep "^Wang " python_quick.txt)"
+
+if diff -q c_quick.txt python_quick.txt > /dev/null; then
+    echo "✓ Results identical!"
+else
+    echo "⚠ Results differ"
+fi
+
+# Quick memory check
+echo "Memory check:"
+if ! command -v valgrind > /dev/null 2>&1; then
+    echo "⚠ valgrind not available, skipping memory check"
+else
+    # --error-exitcode makes valgrind exit non-zero when it reports errors or
+    # leaks; without it valgrind only passes on the exit status of ./surnames.
+    if gunzip -c dblp.xml.gz |
+        valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2>&1; then
+        echo "✓ No major memory issues"
+    else
+        echo "⚠ Check valgrind output"
+    fi
+fi
+
+rm -f c_quick.txt python_quick.txt
+
diff --git a/surnames_validation.sh b/surnames_validation.sh
new file mode 100644
index 0000000..2d7ac00
--- /dev/null
+++ b/surnames_validation.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+# surnames_validation.sh
+# Comprehensive validation script for DBLP surname extraction
+#
+# Usage: ./surnames_validation.sh [DBLP_FILE] [MIN_COUNT]
+#   DBLP_FILE  gzip-compressed DBLP XML dump (default: dblp.xml.gz)
+#   MIN_COUNT  value reported in the header (default: 10000)
+
+set -e # Exit on any error
+
+DBLP_FILE="${1:-dblp.xml.gz}"
+# NOTE(review): MIN_COUNT is only echoed below and never handed to ./surnames
+# or surnames.py - confirm whether the programs are supposed to receive it.
+MIN_COUNT="${2:-10000}"
+# mktemp -d creates a fresh directory with an unpredictable name instead of
+# relying on a guessable PID-based path.
+TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/surnames_validation_XXXXXX")"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}"
+echo "Input: $DBLP_FILE"
+echo "Minimum count: $MIN_COUNT"
+echo "Temp directory: $TEMP_DIR"
+
+# Remove the temp directory only when the script fails. After a successful
+# run the results stay in place, because the final summary points to them.
+cleanup() {
+    local status=$?
+    if [[ $status -ne 0 ]]; then
+        echo -e "\n${YELLOW}Cleaning up...${NC}"
+        rm -rf -- "${TEMP_DIR:?}"
+    fi
+}
+trap cleanup EXIT
+
+# Check if files exist
+if [[ ! -f "surnames" ]]; then
+    echo -e "${RED}Error: C program 'surnames' not found!${NC}"
+    exit 1
+fi
+
+if [[ ! -f "surnames.py" ]]; then
+    echo -e "${RED}Error: Python script 'surnames.py' not found!${NC}"
+    exit 1
+fi
+
+if [[ ! -f "$DBLP_FILE" ]]; then
+    echo -e "${RED}Error: DBLP file '$DBLP_FILE' not found!${NC}"
+    exit 1
+fi
+
+echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}"
+echo "Running valgrind on C program..."
+if command -v valgrind &> /dev/null; then
+    gunzip -c "$DBLP_FILE" | valgrind --leak-check=full --show-leak-kinds=all \
+        --track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \
+        ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1
+
+    echo "Valgrind results:"
+    if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then
+        echo -e "${GREEN}✓ No memory leaks detected${NC}"
+    else
+        echo -e "${RED}⚠ Potential memory issues detected${NC}"
+        grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" | head -5
+    fi
+    echo "Full valgrind log: $TEMP_DIR/valgrind.log"
+else
+    echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}"
+fi
+
+echo -e "\n${BLUE}=== Performance Profiling ===${NC}"
+
+# C program performance
+# C_PROFILER records which tool produced the C timing log so the summary at
+# the end reads the log that actually exists.
+echo "Profiling C program..."
+if command -v perf &> /dev/null; then
+    C_PROFILER="perf"
+    echo "Using perf for detailed profiling..."
+    gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log"
+    echo "Perf results:"
+    grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -4
+else
+    C_PROFILER="time"
+    echo "Using time for basic profiling..."
+    gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log"
+    echo "Time results:"
+    # GNU time prints "Major/Minor (...) page faults" in lower case.
+    grep -E "(Elapsed|Maximum resident|page faults)" "$TEMP_DIR/c_time.log"
+fi
+
+# Python program performance
+echo -e "\nProfiling Python program..."
+gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log"
+echo "Python time results:"
+grep -E "(Elapsed|Maximum resident|page faults)" "$TEMP_DIR/python_time.log"
+
+echo -e "\n${BLUE}=== Output Validation ===${NC}"
+
+# Compare line counts
+c_lines=$(wc -l < "$TEMP_DIR/c_results.txt")
+python_lines=$(wc -l < "$TEMP_DIR/python_results.txt")
+
+echo "Result counts:"
+echo " C program: $c_lines surnames"
+echo " Python script: $python_lines surnames"
+
+if [[ $c_lines -eq $python_lines ]]; then
+    echo -e "${GREEN}✓ Line counts match${NC}"
+else
+    echo -e "${YELLOW}⚠ Line counts differ${NC}"
+fi
+
+# Compare top 10 results
+echo -e "\nTop 10 comparison:"
+echo -e "${BLUE}C Program:${NC}"
+head -10 "$TEMP_DIR/c_results.txt"
+echo -e "${BLUE}Python Script:${NC}"
+head -10 "$TEMP_DIR/python_results.txt"
+
+# Detailed difference analysis
+echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}"
+if command -v diff &> /dev/null; then
+    echo "Running diff analysis..."
+    if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
+        echo -e "${GREEN}✓ Results are identical!${NC}"
+    else
+        echo -e "${YELLOW}⚠ Found differences:${NC}"
+        diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -20
+
+        # Count differences
+        diff_count=$(diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | wc -l)
+        echo "Total difference lines: $diff_count"
+    fi
+else
+    echo "diff not available, using comm..."
+    comm -3 <(sort "$TEMP_DIR/c_results.txt") <(sort "$TEMP_DIR/python_results.txt") | head -10
+fi
+
+# Specific validation checks
+echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}"
+
+# Check Wang count (most frequent)
+c_wang=$(grep "^Wang " "$TEMP_DIR/c_results.txt" | awk '{print $2}')
+python_wang=$(grep "^Wang " "$TEMP_DIR/python_results.txt" | awk '{print $2}')
+
+echo "Wang frequency check:"
+echo " C: $c_wang"
+echo " Python: $python_wang"
+if [[ "$c_wang" == "$python_wang" ]]; then
+    echo -e "${GREEN}✓ Wang counts match${NC}"
+else
+    echo -e "${RED}✗ Wang counts differ${NC}"
+fi
+
+# Check for duplicates in C results
+echo -e "\nDuplicate check (C program):"
+duplicates=$(awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | wc -l)
+if [[ $duplicates -eq 0 ]]; then
+    echo -e "${GREEN}✓ No duplicates found${NC}"
+else
+    echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}"
+    awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | head -5
+fi
+
+# Performance comparison
+echo -e "\n${BLUE}=== Performance Summary ===${NC}"
+if [[ "$C_PROFILER" == "perf" ]]; then
+    # perf stat reports wall-clock time as "<seconds> seconds time elapsed";
+    # there is no c_time.log in this case.
+    c_time=$(awk '/seconds time elapsed/ {print $1 "s"; exit}' "$TEMP_DIR/c_perf.log")
+else
+    c_time=$(grep "Elapsed" "$TEMP_DIR/c_time.log" | awk '{print $8}' | head -1)
+fi
+python_time=$(grep "Elapsed" "$TEMP_DIR/python_time.log" | awk '{print $8}' | head -1)
+
+echo "Execution times:"
+echo " C program: $c_time"
+echo " Python script: $python_time"
+
+# Save results for later analysis
+echo -e "\n${BLUE}=== Results Saved ===${NC}"
+echo "Results saved in: $TEMP_DIR"
+echo " C results: $TEMP_DIR/c_results.txt"
+echo " Python results: $TEMP_DIR/python_results.txt"
+echo " Valgrind log: $TEMP_DIR/valgrind.log"
+echo " Performance logs: $TEMP_DIR/c_${C_PROFILER}.log, $TEMP_DIR/python_time.log"
+
+echo -e "\n${GREEN}Validation complete!${NC}"
+