#!/bin/bash
# surnames_validation.sh
# Comprehensive validation script for DBLP surname extraction.
#
# Usage: surnames_validation.sh [DBLP_FILE] [MIN_COUNT]
#   DBLP_FILE  gzip-compressed DBLP dump fed to both programs on stdin
#              (default: dblp.xml.gz)
#   MIN_COUNT  minimum count (default: 10000). NOTE(review): this script only
#              echoes the value; it is not passed to either program.

set -e # Exit on any error

DBLP_FILE="${1:-dblp.xml.gz}"
MIN_COUNT="${2:-10000}"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Create temp directory.
# mktemp -d picks an unpredictable name and creates the directory privately;
# a fixed "/tmp/..._$$" name can be pre-created by another user, and
# `mkdir -p` would silently reuse it.
TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/surnames_validation_XXXXXX")" || {
  echo -e "${RED}Error: could not create temp directory${NC}" >&2
  exit 1
}

echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}"
echo "Input: $DBLP_FILE"
echo "Minimum count: $MIN_COUNT"
echo "Temp directory: $TEMP_DIR"

# Function to cleanup
#######################################
# Remove the temporary working directory and everything in it.
# Globals:   TEMP_DIR (read), YELLOW, NC (read)
# Outputs:   a "Cleaning up..." notice on stdout
#######################################
cleanup() {
  echo -e "\n${YELLOW}Cleaning up...${NC}"
  # Never hand rm -rf an empty path; `--` stops option parsing.
  if [[ -n "${TEMP_DIR:-}" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
# Run cleanup on every exit path, including aborts caused by `set -e`.
trap cleanup EXIT

# Check if files exist

#######################################
# Abort the script unless the given path is an existing regular file.
# Globals:   RED, NC (read)
# Arguments: $1 - path to check
#            $2 - description used in the error message
# Outputs:   error message on stderr when the file is missing
#######################################
require_file() {
  if [[ ! -f "$1" ]]; then
    echo -e "${RED}Error: $2 '$1' not found!${NC}" >&2
    exit 1
  fi
}

require_file "surnames" "C program"
require_file "surnames.py" "Python script"
require_file "$DBLP_FILE" "DBLP file"

# The C program is later invoked as ./surnames, so existing is not enough:
# it must also be executable.
if [[ ! -x "./surnames" ]]; then
  echo -e "${RED}Error: C program 'surnames' is not executable!${NC}" >&2
  exit 1
fi

echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}"
echo "Running valgrind on C program..."
if command -v valgrind &> /dev/null; then
  # Capture the exit status instead of letting `set -e` abort the script:
  # a crash or non-zero exit under valgrind is exactly the case in which the
  # valgrind log should still be summarised below.
  valgrind_status=0
  gunzip -c "$DBLP_FILE" \
    | valgrind --leak-check=full --show-leak-kinds=all \
      --track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \
      ./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1 \
    || valgrind_status=$?
  if [[ "$valgrind_status" -ne 0 ]]; then
    echo -e "${RED}⚠ C program exited with status $valgrind_status under valgrind${NC}"
  fi

  echo "Valgrind results:"
  if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then
    echo -e "${GREEN}✓ No memory leaks detected${NC}"
  else
    echo -e "${RED}⚠ Potential memory issues detected${NC}"
    # Show at most the first five leak-summary lines; finding none is not
    # an error.
    grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" \
      | head -n 5 || true
  fi
  echo "Full valgrind log: $TEMP_DIR/valgrind.log"
else
  echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}"
fi

echo -e "\n${BLUE}=== Performance Profiling ===${NC}"

# /usr/bin/time is needed for the Python run and for the C run when perf is
# absent. Check up front: otherwise the "not found" error would be written
# into the redirected log file and the script would abort without a message.
if [[ ! -x /usr/bin/time ]]; then
  echo -e "${RED}Error: /usr/bin/time not found (GNU time is required for profiling)${NC}" >&2
  exit 1
fi

#######################################
# Print the wall-clock, peak-memory and page-fault lines of a `time -v` log.
# Arguments: $1 - path to the log file
# Outputs:   the matching lines on stdout
#######################################
print_time_summary() {
  # GNU time writes "page faults" in lower case, so match case-insensitively.
  # Finding no matching line is not treated as a failure.
  grep -iE "(Elapsed|Maximum resident|Page faults)" "$1" || true
}

# C program performance
echo "Profiling C program..."
if command -v perf &> /dev/null; then
  echo "Using perf for detailed profiling..."
  # stdout carries the program's results; stderr (redirected to c_perf.log)
  # carries the perf statistics.
  gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log"
  echo "Perf results:"
  grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -n 4
else
  echo "Using time for basic profiling..."
  gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log"
  echo "Time results:"
  print_time_summary "$TEMP_DIR/c_time.log"
fi

# Python program performance
echo -e "\nProfiling Python program..."
gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log"
echo "Python time results:"
print_time_summary "$TEMP_DIR/python_time.log"

echo -e "\n${BLUE}=== Output Validation ===${NC}"

#######################################
# Report the line counts of both result files and whether they agree.
# Globals:   c_lines, python_lines (written); GREEN, YELLOW, NC (read)
# Arguments: $1 - C program results file
#            $2 - Python script results file
# Outputs:   the counts and a match/differ verdict on stdout
#######################################
compare_line_counts() {
  c_lines=$(wc -l < "$1")
  python_lines=$(wc -l < "$2")
  # Some wc implementations pad the number with blanks; arithmetic
  # expansion normalises it to a bare integer.
  c_lines=$((c_lines))
  python_lines=$((python_lines))

  echo "Result counts:"
  echo " C program: $c_lines surnames"
  echo " Python script: $python_lines surnames"

  if [[ "$c_lines" -eq "$python_lines" ]]; then
    echo -e "${GREEN}✓ Line counts match${NC}"
  else
    echo -e "${YELLOW}⚠ Line counts differ${NC}"
  fi
}

# Compare line counts
compare_line_counts "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt"

# Compare top 10 results
echo -e "\nTop 10 comparison:"
echo -e "${BLUE}C Program:${NC}"
head -n 10 "$TEMP_DIR/c_results.txt"
echo -e "${BLUE}Python Script:${NC}"
head -n 10 "$TEMP_DIR/python_results.txt"

# Detailed difference analysis
echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}"

#######################################
# Compare two result files and summarise how they differ.
# Globals:   diff_count (written when the files differ);
#            GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - C program results file
#            $2 - Python script results file
# Outputs:   verdict, the first 20 diff lines and the total diff line count
#######################################
report_differences() {
  local c_file=$1
  local py_file=$2
  local diff_output
  local diff_status=0

  if command -v diff &> /dev/null; then
    echo "Running diff analysis..."
    # Run diff once and reuse its output. Capturing the status also keeps
    # `set -e` from aborting, since diff exits 1 when the files differ.
    diff_output=$(diff "$c_file" "$py_file") || diff_status=$?
    case "$diff_status" in
      0)
        echo -e "${GREEN}✓ Results are identical!${NC}"
        ;;
      1)
        echo -e "${YELLOW}⚠ Found differences:${NC}"
        head -n 20 <<< "$diff_output"

        # Count differences
        diff_count=$(wc -l <<< "$diff_output")
        diff_count=$((diff_count))
        echo "Total difference lines: $diff_count"
        ;;
      *)
        # Status 2 means diff itself failed (e.g. unreadable file); do not
        # report that as "differences".
        echo -e "${RED}✗ diff failed with status $diff_status${NC}"
        ;;
    esac
  else
    echo "diff not available, using comm..."
    comm -3 <(sort "$c_file") <(sort "$py_file") | head -n 10
  fi
}

report_differences "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt"

# Specific validation checks
echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}"

#######################################
# Print the second field of the first line whose first field equals the
# given surname; print nothing if there is no such line.
# Arguments: $1 - results file
#            $2 - surname to look up
#######################################
surname_count() {
  awk -v name="$2" '$1 == name { print $2; exit }' "$1"
}

#######################################
# Compare the count recorded for "Wang" in both result files.
# Globals:   c_wang, python_wang (written); GREEN, RED, NC (read)
# Arguments: $1 - C program results file
#            $2 - Python script results file
#######################################
check_wang_counts() {
  c_wang=$(surname_count "$1" "Wang")
  python_wang=$(surname_count "$2" "Wang")

  echo "Wang frequency check:"
  echo " C: $c_wang"
  echo " Python: $python_wang"
  if [[ -z "$c_wang" && -z "$python_wang" ]]; then
    # Two empty strings compare equal; without this branch a missing entry
    # in both files would be reported as a match.
    echo -e "${RED}✗ Wang not found in either result file${NC}"
  elif [[ "$c_wang" == "$python_wang" ]]; then
    echo -e "${GREEN}✓ Wang counts match${NC}"
  else
    echo -e "${RED}✗ Wang counts differ${NC}"
  fi
}

#######################################
# Report first-field values that occur on more than one line of a file.
# Globals:   duplicates (written); GREEN, RED, NC (read)
# Arguments: $1 - results file
#######################################
check_duplicates() {
  duplicates=$(awk '{print $1}' "$1" | sort | uniq -d | wc -l)
  # Normalise possibly blank-padded wc output to a bare integer.
  duplicates=$((duplicates))
  if [[ "$duplicates" -eq 0 ]]; then
    echo -e "${GREEN}✓ No duplicates found${NC}"
  else
    echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}"
    awk '{print $1}' "$1" | sort | uniq -d | head -n 5
  fi
}

# Check Wang count (most frequent)
check_wang_counts "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt"

# Check for duplicates in C results
echo -e "\nDuplicate check (C program):"
check_duplicates "$TEMP_DIR/c_results.txt"

# Performance comparison
echo -e "\n${BLUE}=== Performance Summary ===${NC}"

#######################################
# Print the value on the first "Elapsed" line of a `time -v` log.
# Arguments: $1 - log file; a missing file yields no output
#######################################
elapsed_from_time_log() {
  [[ -f "$1" ]] || return 0
  awk '/Elapsed/ { print $8; exit }' "$1"
}

#######################################
# Print the "seconds time elapsed" figure of a `perf stat` log, suffixed "s".
# Arguments: $1 - log file; a missing file yields no output
#######################################
elapsed_from_perf_log() {
  [[ -f "$1" ]] || return 0
  awk '/seconds time elapsed/ { print $1 "s"; exit }' "$1"
}

c_time=$(elapsed_from_time_log "$TEMP_DIR/c_time.log")
if [[ -z "$c_time" ]]; then
  # When perf is available, the C run is profiled with perf stat and writes
  # c_perf.log instead of c_time.log, so fall back to that log.
  c_time=$(elapsed_from_perf_log "$TEMP_DIR/c_perf.log")
fi
python_time=$(elapsed_from_time_log "$TEMP_DIR/python_time.log")

echo "Execution times:"
echo " C program: ${c_time:-n/a}"
echo " Python script: ${python_time:-n/a}"

# Save results for later analysis
# The script registers an EXIT trap (cleanup) that deletes $TEMP_DIR. Disarm
# it once validation has finished so the files announced below still exist
# after the script exits; exits before this point are still cleaned up.
trap - EXIT

echo -e "\n${BLUE}=== Results Saved ===${NC}"
echo "Results saved in: $TEMP_DIR"
echo " C results: $TEMP_DIR/c_results.txt"
echo " Python results: $TEMP_DIR/python_results.txt"
# The valgrind log only exists when valgrind was available.
if [[ -f "$TEMP_DIR/valgrind.log" ]]; then
  echo " Valgrind log: $TEMP_DIR/valgrind.log"
fi
echo " Performance logs: $TEMP_DIR/*_time.log"
# The perf log only exists when the C program was profiled with perf.
if [[ -f "$TEMP_DIR/c_perf.log" ]]; then
  echo " Perf log: $TEMP_DIR/c_perf.log"
fi

echo -e "\n${GREEN}Validation complete!${NC}"