#!/bin/bash
# surnames_validation.sh
#
# Comprehensive validation script for DBLP surname extraction.
#
# Runs the C program (./surnames) and the Python script (surnames.py) on the
# same decompressed DBLP dump (both read it on stdin) and then:
#   1. checks the C program with valgrind (skipped if valgrind is missing),
#   2. profiles the C program (perf if available, otherwise /usr/bin/time -v)
#      and the Python script (/usr/bin/time -v),
#   3. compares the two outputs: line counts, top 10, full diff, the "Wang"
#      entry, and duplicate surnames in the C output,
#   4. prints a short timing summary and where the result files are.
#
# Usage: surnames_validation.sh [DBLP_FILE] [MIN_COUNT]
#   DBLP_FILE  gzip-compressed input (default: dblp.xml.gz)
#   MIN_COUNT  minimum count (default: 10000). NOTE: this value is only echoed
#              in the banner; this script does not pass it to either program.
#
# Environment:
#   KEEP_RESULTS  1 (default): keep the temp directory after a successful run
#                 so the advertised result files can be inspected.
#                 0: always remove it on exit.
#   The temp directory is always removed when the script fails.

# -e: abort on unhandled errors; -u: unset variables are errors;
# pipefail: a failing gunzip is not hidden by the last pipeline stage.
set -euo pipefail

readonly DBLP_FILE="${1:-dblp.xml.gz}"
readonly MIN_COUNT="${2:-10000}"
readonly KEEP_RESULTS="${KEEP_RESULTS:-1}"
readonly TIME_BIN="/usr/bin/time"

# GNU time -v prints "Major/Minor (...) page faults" in lower case, so the
# pattern accepts both spellings.
readonly TIME_FIELDS='(Elapsed|Maximum resident|[Pp]age faults)'

# Colors for output (ANSI-C quoting stores the real escape bytes).
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly NC=$'\033[0m' # No Color

# --- small output helpers ---------------------------------------------------

# Prints a blue section header preceded by a blank line.
section() { printf '\n%s=== %s ===%s\n' "$BLUE" "$1" "$NC"; }

# Prints a green success line.
ok() { printf '%s✓ %s%s\n' "$GREEN" "$1" "$NC"; }

# Prints a yellow warning line.
warn() { printf '%s⚠ %s%s\n' "$YELLOW" "$1" "$NC"; }

# Prints a red failure line (does not exit).
fail() { printf '%s✗ %s%s\n' "$RED" "$1" "$NC"; }

# Prints a red error to stderr and exits with status 1.
die() {
  printf '%sError: %s%s\n' "$RED" "$1" "$NC" >&2
  exit 1
}

# --- temp directory handling ------------------------------------------------

# EXIT handler. Keeps the temp directory after a successful run (unless
# KEEP_RESULTS=0) because the script tells the user the results are saved
# there; removes it otherwise. Does not call exit, so the script's exit status
# is left untouched.
cleanup() {
  local status=$?
  if ((status == 0)) && [[ "$KEEP_RESULTS" != "0" ]]; then
    return 0
  fi
  printf '\n%sCleaning up...%s\n' "$YELLOW" "$NC"
  rm -rf -- "${TEMP_DIR:?}"
}

# mktemp -d creates a private directory with an unpredictable name.
TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/surnames_validation.XXXXXX")
readonly TEMP_DIR
trap cleanup EXIT

readonly C_RESULTS="$TEMP_DIR/c_results.txt"
readonly PY_RESULTS="$TEMP_DIR/python_results.txt"
readonly C_TIME_LOG="$TEMP_DIR/c_time.log"
readonly C_PERF_LOG="$TEMP_DIR/c_perf.log"
readonly PY_TIME_LOG="$TEMP_DIR/python_time.log"
readonly VALGRIND_LOG="$TEMP_DIR/valgrind.log"

# --- steps ------------------------------------------------------------------

# Verifies that the programs, tools and input file the script needs exist.
check_prerequisites() {
  [[ -f "surnames" ]] || die "C program 'surnames' not found!"
  [[ -x "surnames" ]] || die "C program 'surnames' is not executable!"
  [[ -f "surnames.py" ]] || die "Python script 'surnames.py' not found!"
  [[ -f "$DBLP_FILE" ]] || die "DBLP file '$DBLP_FILE' not found!"
  command -v gunzip > /dev/null 2>&1 || die "gunzip not found!"
  command -v python3 > /dev/null 2>&1 || die "python3 not found!"
  [[ -x "$TIME_BIN" ]] || die "$TIME_BIN not found (needed for profiling)!"
}

# Runs the C program under valgrind and reports whether all heap blocks were
# freed. Skipped with a notice when valgrind is not installed.
run_memcheck() {
  section "Memory Leak Testing (Valgrind)"
  echo "Running valgrind on C program..."

  if ! command -v valgrind > /dev/null 2>&1; then
    printf '%sValgrind not available, skipping memory check%s\n' "$YELLOW" "$NC"
    return 0
  fi

  gunzip -c "$DBLP_FILE" \
    | valgrind --leak-check=full --show-leak-kinds=all \
      --track-origins=yes --verbose --log-file="$VALGRIND_LOG" \
      ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1

  echo "Valgrind results:"
  if grep -q "All heap blocks were freed" "$VALGRIND_LOG"; then
    ok "No memory leaks detected"
  else
    printf '%s⚠ Potential memory issues detected%s\n' "$RED" "$NC"
    # Report-only pipeline: no match (or head closing early) must not abort.
    grep -E "(definitely lost|possibly lost|still reachable)" "$VALGRIND_LOG" \
      | head -5 || true
  fi
  echo "Full valgrind log: $VALGRIND_LOG"
}

# Profiles the C program, writing its output to C_RESULTS. Uses perf when
# available (log: C_PERF_LOG), otherwise /usr/bin/time -v (log: C_TIME_LOG).
profile_c() {
  echo "Profiling C program..."
  if command -v perf > /dev/null 2>&1; then
    echo "Using perf for detailed profiling..."
    gunzip -c "$DBLP_FILE" \
      | perf stat -d ./surnames > "$C_RESULTS" 2> "$C_PERF_LOG"
    echo "Perf results:"
    grep -E "(task-clock|cycles|instructions|cache-misses)" "$C_PERF_LOG" \
      | head -4 || true
  else
    echo "Using time for basic profiling..."
    gunzip -c "$DBLP_FILE" \
      | "$TIME_BIN" -v ./surnames > "$C_RESULTS" 2> "$C_TIME_LOG"
    echo "Time results:"
    grep -E "$TIME_FIELDS" "$C_TIME_LOG" || true
  fi
}

# Profiles the Python script with /usr/bin/time -v, writing its output to
# PY_RESULTS and the timing report to PY_TIME_LOG.
profile_python() {
  printf '\nProfiling Python program...\n'
  gunzip -c "$DBLP_FILE" \
    | "$TIME_BIN" -v python3 surnames.py > "$PY_RESULTS" 2> "$PY_TIME_LOG"
  echo "Python time results:"
  grep -E "$TIME_FIELDS" "$PY_TIME_LOG" || true
}

# Compares the number of output lines and shows the first 10 lines of each.
validate_outputs() {
  local c_lines python_lines

  section "Output Validation"
  c_lines=$(wc -l < "$C_RESULTS")
  python_lines=$(wc -l < "$PY_RESULTS")

  echo "Result counts:"
  echo "  C program: $c_lines surnames"
  echo "  Python script: $python_lines surnames"
  if ((c_lines == python_lines)); then
    ok "Line counts match"
  else
    warn "Line counts differ"
  fi

  printf '\nTop 10 comparison:\n'
  printf '%sC Program:%s\n' "$BLUE" "$NC"
  head -10 "$C_RESULTS"
  printf '%sPython Script:%s\n' "$BLUE" "$NC"
  head -10 "$PY_RESULTS"
}

# Diffs the two result files (falls back to comm on sorted copies when diff is
# not installed). diff runs once; its output is kept in results.diff.
analyze_differences() {
  local diff_file="$TEMP_DIR/results.diff"
  local diff_status=0

  section "Detailed Difference Analysis"
  if ! command -v diff > /dev/null 2>&1; then
    echo "diff not available, using comm..."
    comm -3 <(sort "$C_RESULTS") <(sort "$PY_RESULTS") | head -10 || true
    return 0
  fi

  echo "Running diff analysis..."
  # diff exits 0 (identical), 1 (different) or >1 (trouble); capture the
  # status instead of letting set -e abort on "different".
  diff "$C_RESULTS" "$PY_RESULTS" > "$diff_file" || diff_status=$?
  case "$diff_status" in
    0)
      ok "Results are identical!"
      ;;
    1)
      warn "Found differences:"
      head -20 "$diff_file"
      echo "Total difference lines: $(wc -l < "$diff_file")"
      ;;
    *)
      die "diff failed with status $diff_status"
      ;;
  esac
}

# Spot checks: the count on the "Wang" line must agree between both outputs,
# and the C output must not list any surname (first field) twice.
run_specific_checks() {
  local c_wang python_wang duplicates

  section "Specific Validation Checks"

  # Second field of every line starting with "Wang ".
  c_wang=$(awk '/^Wang / { print $2 }' "$C_RESULTS")
  python_wang=$(awk '/^Wang / { print $2 }' "$PY_RESULTS")
  echo "Wang frequency check:"
  echo "  C: $c_wang"
  echo "  Python: $python_wang"
  if [[ -z "$c_wang" && -z "$python_wang" ]]; then
    # Two empty values would otherwise compare equal and report a match.
    warn "Wang not found in either output"
  elif [[ "$c_wang" == "$python_wang" ]]; then
    ok "Wang counts match"
  else
    fail "Wang counts differ"
  fi

  printf '\nDuplicate check (C program):\n'
  duplicates=$(awk '{ print $1 }' "$C_RESULTS" | sort | uniq -d | wc -l)
  if ((duplicates == 0)); then
    ok "No duplicates found"
  else
    fail "Found $duplicates duplicate surnames"
    awk '{ print $1 }' "$C_RESULTS" | sort | uniq -d | head -5 || true
  fi
}

# Prints the wall-clock time of both runs. The C time comes from the time log
# when it exists, otherwise from perf's "seconds time elapsed" line.
print_performance_summary() {
  local c_time="" python_time=""

  section "Performance Summary"
  if [[ -f "$C_TIME_LOG" ]]; then
    c_time=$(awk '/Elapsed/ { print $8; exit }' "$C_TIME_LOG")
  elif [[ -f "$C_PERF_LOG" ]]; then
    c_time=$(awk '/seconds time elapsed/ { print $1 " s"; exit }' "$C_PERF_LOG")
  fi
  python_time=$(awk '/Elapsed/ { print $8; exit }' "$PY_TIME_LOG")

  echo "Execution times:"
  echo "  C program: ${c_time:-n/a}"
  echo "  Python script: ${python_time:-n/a}"
}

# Tells the user where the result files are (or that they will be removed).
print_saved_results() {
  section "Results Saved"
  if [[ "$KEEP_RESULTS" == "0" ]]; then
    echo "KEEP_RESULTS=0: $TEMP_DIR will be removed on exit"
    return 0
  fi

  echo "Results saved in: $TEMP_DIR"
  echo "  C results: $C_RESULTS"
  echo "  Python results: $PY_RESULTS"
  if [[ -f "$VALGRIND_LOG" ]]; then
    echo "  Valgrind log: $VALGRIND_LOG"
  fi
  echo "  Performance logs: $TEMP_DIR/*_time.log"
  if [[ -f "$C_PERF_LOG" ]]; then
    echo "  Perf log: $C_PERF_LOG"
  fi
}

main() {
  printf '%s=== DBLP Surname Extraction Validation ===%s\n' "$BLUE" "$NC"
  echo "Input: $DBLP_FILE"
  echo "Minimum count: $MIN_COUNT"
  echo "Temp directory: $TEMP_DIR"

  check_prerequisites
  run_memcheck

  section "Performance Profiling"
  profile_c
  profile_python

  validate_outputs
  analyze_differences
  run_specific_checks
  print_performance_summary
  print_saved_results

  printf '\n%sValidation complete!%s\n' "$GREEN" "$NC"
}

main "$@"