Files
c_prog/validate_extraction.sh

42 lines
1.1 KiB
Bash
Executable File

#!/bin/bash
#
# validate_extraction.sh - cross-check the C surname counter against a
# reference shell pipeline built from grep/sed/awk/sort/uniq.
#
# The reference pipeline takes the last whitespace-separated word of every
# <author>/<editor> element as the surname, counts occurrences, and keeps the
# surnames that occur at least MIN_COUNT times. The C program's output is
# written next to it and the top entries of both are printed for comparison.
#
# Environment (all optional, defaults match the historical behaviour):
#   DBLP_XML   XML file to scan                 (default: dblp.xml)
#   C_PROG     program under validation         (default: ./main)
#   MIN_COUNT  minimum occurrences to report    (default: 10000)
#
# Outputs: bash_results.txt and c_results.txt in the current directory,
#          progress and comparison on stdout, errors on stderr.
# Exit status: 0 on success, 1 if an input is missing or a step fails.
#
# NOTE(review): C_PROG is run without arguments, so it presumably locates its
# own input file. If DBLP_XML is overridden, confirm the C program reads the
# same file, otherwise the comparison is meaningless.

# No `set -e` on purpose: every step that can fail is checked explicitly so
# the script can say which step failed. pipefail makes a failing stage of the
# extraction pipeline visible instead of being masked by the final `sort`.
set -uo pipefail

#######################################
# Print "surname count" lines, most frequent first, for every surname that
# occurs at least min_count times in the <author>/<editor> lines of a file.
# Arguments:
#   $1 - path to the XML file
#   $2 - minimum occurrence count (default: $MIN_COUNT, else 10000)
# Outputs:
#   "surname count" lines on stdout; diagnostics on stderr
# Returns:
#   0 on success; non-zero if the file is unreadable, the threshold is not a
#   non-negative integer, or a pipeline stage fails (this includes grep
#   finding no <author>/<editor> line at all).
#######################################
extract_surnames() {
  local xml=${1:-}
  local min_count=${2:-${MIN_COUNT:-10000}}

  if [[ ! -r "$xml" ]]; then
    printf 'error: cannot read input file: %s\n' "$xml" >&2
    return 1
  fi
  if [[ ! "$min_count" =~ ^[0-9]+$ ]]; then
    printf 'error: minimum count must be a non-negative integer: %s\n' \
      "$min_count" >&2
    return 1
  fi

  (
    # Byte-wise processing: same ordering on every machine, and much faster
    # than locale-aware matching/collation on a multi-gigabyte input.
    export LC_ALL=C

    grep -E -e '<(author|editor)' -- "$xml" \
      | sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' \
      | awk '{
          # Skip a trailing 4-digit token (e.g. "Wei Wang 0001") by moving an
          # index instead of decrementing NF, and spell the digits out rather
          # than using {4}: both are not handled uniformly by every awk.
          last = NF
          if (last > 0 && $last ~ /^[0-9][0-9][0-9][0-9]$/) last--
          if (last > 0) {
            surname = $last
            # Drop XML entities such as &uuml; from the surname.
            gsub(/&[^;]*;/, "", surname)
            if (length(surname) > 0) print surname
          }
        }' \
      | sort | uniq -c \
      | awk -v min="$min_count" '$1 + 0 >= min + 0 { print $2, $1 }' \
      | sort -k2,2 -nr   # key is the count field only, highest first
  )
}

#######################################
# Run both extractions and print a quick side-by-side comparison.
# Globals:
#   DBLP_XML, C_PROG, MIN_COUNT (read, optional)
# Outputs:
#   Writes bash_results.txt and c_results.txt; report on stdout.
# Returns:
#   0 on success, 1 on the first failing step.
#######################################
main() {
  local xml=${DBLP_XML:-dblp.xml}
  local c_prog=${C_PROG:-./main}
  local min_count=${MIN_COUNT:-10000}

  echo "Running full validation with working pipeline..."

  # Fail fast: check both inputs before starting the minutes-long extraction.
  if [[ ! -r "$xml" ]]; then
    printf 'error: input file not found or unreadable: %s\n' "$xml" >&2
    return 1
  fi
  if ! command -v -- "$c_prog" >/dev/null 2>&1; then
    printf 'error: C program not found or not executable: %s\n' "$c_prog" >&2
    return 1
  fi

  # Generate complete bash results using the reference pipeline.
  echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
  if ! extract_surnames "$xml" "$min_count" > bash_results.txt; then
    printf 'error: surname extraction from %s failed\n' "$xml" >&2
    return 1
  fi
  echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"

  # Get the C program results.
  if ! "$c_prog" > c_results.txt; then
    printf 'error: %s exited with a failure status\n' "$c_prog" >&2
    return 1
  fi
  echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"

  # Quick comparison of top entries.
  echo "Top 5 comparison:"
  echo "=== C Program ==="
  head -5 c_results.txt
  echo "=== Bash Script ==="
  head -5 bash_results.txt

  # Check Wang specifically (an empty value means the surname is absent).
  echo "Wang comparison:"
  echo "C: $(grep "^Wang " c_results.txt)"
  echo "Bash: $(grep "^Wang " bash_results.txt)"
}

# Last command of the script: its return status becomes the exit status.
main