#!/bin/bash
#
# Cross-checks the surname counts printed by the C program (./main) against an
# independent grep/sed/awk pipeline run over dblp.xml.
#
# Usage:  run from the directory that contains dblp.xml and ./main (no args).
# Reads:  dblp.xml, ./main
# Writes: bash_results.txt, c_results.txt (both overwritten)
#
# NOTE: each command must stay on its own line. Anything that shares the line
# with "#!" is treated as part of the interpreter line, not as a command, so a
# single-line version of this script silently does nothing useful.

set -euo pipefail

readonly INPUT_XML="dblp.xml"
readonly C_PROGRAM="./main"
readonly BASH_RESULTS="bash_results.txt"
readonly C_RESULTS="c_results.txt"
# Only surnames seen at least this many times are reported.
readonly MIN_COUNT=10000

#######################################
# Prints an error message to stderr and aborts the script.
# Arguments: $* - message text
# Returns:   never (exits with status 1)
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Extracts surnames from <author>/<editor> lines and counts them.
# Globals:   MIN_COUNT (read)
# Arguments: $1 - path to the XML file
# Outputs:   "surname count" lines on stdout, highest count first
# Returns:   non-zero if any pipeline stage fails (grep also returns
#            non-zero when no line matches at all)
#######################################
extract_surnames() {
  local xml=$1

  grep -E -- '<(author|editor)' "$xml" |
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' |
    awk '{
      # Drop a trailing 4-digit token so it is not taken for the surname
      # (presumably a homonym-disambiguation number such as 0001; confirm).
      if ($NF ~ /^[0-9]{4}$/) {
        NF--
      }
      if (NF > 0) {
        surname = $NF
        # Strip XML character entities (&...;) from the surname.
        gsub(/&[^;]*;/, "", surname)
        if (length(surname) > 0)
          print surname
      }
    }' |
    sort |
    uniq -c |
    awk -v threshold="$MIN_COUNT" '$1 >= threshold {print $2, $1}' |
    sort -k2 -nr
}

#######################################
# Runs both extractions and prints a short side-by-side comparison.
# Globals:   INPUT_XML, C_PROGRAM, BASH_RESULTS, C_RESULTS (read)
# Outputs:   progress and comparison text on stdout; errors on stderr
# Returns:   0 on success; exits 1 if an input is missing or a step fails
#######################################
main() {
  echo "Running full validation with working pipeline..."

  # Fail early with a clear message instead of comparing empty result files.
  [[ -r "$INPUT_XML" ]] || die "cannot read input file: $INPUT_XML"
  [[ -x "$C_PROGRAM" ]] || die "C program not found or not executable: $C_PROGRAM"

  # Generate complete bash results using the proven pipeline
  echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
  if ! extract_surnames "$INPUT_XML" > "$BASH_RESULTS"; then
    die "bash surname extraction failed (see $BASH_RESULTS for partial output)"
  fi
  echo "Bash extraction complete. Results: $(wc -l < "$BASH_RESULTS") surnames"

  # Get the C program results
  "$C_PROGRAM" > "$C_RESULTS" || die "$C_PROGRAM exited with an error"
  echo "C extraction complete. Results: $(wc -l < "$C_RESULTS") surnames"

  # Quick comparison of top entries
  echo "Top 5 comparison:"
  echo "=== C Program ==="
  head -5 "$C_RESULTS"
  echo "=== Bash Script ==="
  head -5 "$BASH_RESULTS"

  # Check Wang specifically. A missing entry just prints an empty value: a
  # failing grep inside $(...) used as an argument does not trip `set -e`.
  echo "Wang comparison:"
  echo "C: $(grep "^Wang " "$C_RESULTS")"
  echo "Bash: $(grep "^Wang " "$BASH_RESULTS")"
}

main "$@"