#!/bin/bash

# Print "<surname> <count>" for every surname that occurs at least a given
# number of times in the <author>/<editor> elements of a DBLP-style XML file,
# ordered by descending count.
#
# The surname is taken to be the last whitespace-separated word of the element
# text, after dropping a trailing four-digit token and removing XML entities
# (&...;) from the word.
#
# Arguments:
#   $1 - path to the XML file to scan
#   $2 - minimum number of occurrences to report (default: 10000)
# Outputs:
#   one "<surname> <count>" line per surname on stdout; errors on stderr
# Returns:
#   non-zero when the input file is missing or unreadable
extract_surnames() {
    local xml_file=$1
    local min_count=${2:-10000}

    if [[ ! -f "$xml_file" || ! -r "$xml_file" ]]; then
        echo "Error: cannot read input file '$xml_file'" >&2
        return 1
    fi

    (
        # Byte-wise locale: much faster on multi-gigabyte input and makes the
        # grouping done by sort/uniq independent of the caller's collation.
        export LC_ALL=C

        grep -E "<(author|editor)" -- "$xml_file" |
            sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' |
            awk '{
                # Index of the word used as surname. A separate variable is
                # used because decrementing NF is not portable, and the four
                # digits are spelled out because some awk implementations do
                # not support {n} interval expressions.
                last = NF
                if ($last ~ /^[0-9][0-9][0-9][0-9]$/) { last-- }
                if (last > 0) {
                    surname = $last
                    gsub(/&[^;]*;/, "", surname)
                    if (length(surname) > 0) print surname
                }
            }' |
            sort | uniq -c |
            awk -v min="$min_count" '$1 >= min + 0 {print $2, $1}' |
            sort -k2,2 -nr
    )
}

echo "Running full validation with working pipeline..."

# Generate complete bash results using the proven pipeline
echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
if extract_surnames dblp.xml > bash_results.txt; then
    echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"
else
    # Do not claim success when the input could not be processed; the rest of
    # the script still runs, as it did before.
    echo "Bash extraction failed; bash_results.txt does not contain valid results" >&2
fi

# Get your C program results: run ./main and capture its stdout in
# c_results.txt. The success message is printed only when ./main exits with
# status 0; a missing or failing program is reported on stderr instead of being
# presented as a completed extraction. The script continues either way.
if ./main > c_results.txt; then
    echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"
else
    main_status=$?
    echo "C extraction failed: ./main exited with status ${main_status}; c_results.txt may be empty or incomplete" >&2
fi

# Print a "=== <label> ===" banner followed by the first five lines of a file.
#   $1 - label shown in the banner
#   $2 - results file to read
print_top_five() {
    printf '=== %s ===\n' "$1"
    head -n 5 "$2"
}

# Print "<label>: <line>" where <line> is whatever the results file holds for
# the surname Wang (lines beginning with "Wang "); empty when there is none.
#   $1 - label shown before the colon
#   $2 - results file to search
print_wang_entry() {
    printf '%s: %s\n' "$1" "$(grep "^Wang " "$2")"
}

# Side-by-side look at the first entries of both result files
printf '%s\n' "Top 5 comparison:"
print_top_five "C Program" c_results.txt
print_top_five "Bash Script" bash_results.txt

# Spot-check a single surname in both result files
printf '%s\n' "Wang comparison:"
print_wang_entry "C" c_results.txt
print_wang_entry "Bash" bash_results.txt