some small fixes
This commit is contained in:
@@ -9,7 +9,7 @@
|
|||||||
#define MIN_COUNT 10000
|
#define MIN_COUNT 10000
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Usage: pipe dblp.xml to the programm or have it in the same folder as the program
|
* Usage: pipe dblp.xml to the program or have it in the same folder as the program
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
||||||
@@ -27,10 +27,12 @@ typedef struct person {
|
|||||||
int count;
|
int count;
|
||||||
} person;
|
} person;
|
||||||
|
|
||||||
void newPerson(person *p, const char *name) {
|
person* newPerson(const char *name) {
|
||||||
|
person *p = (person *) malloc(sizeof(person));
|
||||||
string_ncopy(p->name, name, BUFFER_LENGTH);
|
string_ncopy(p->name, name, BUFFER_LENGTH);
|
||||||
p->count = 1;
|
p->count = 1;
|
||||||
p->next = NULL;
|
p->next = NULL;
|
||||||
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sorted_name_insert(person **head, char *name) {
|
void sorted_name_insert(person **head, char *name) {
|
||||||
@@ -43,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
|
|||||||
p = p->next;
|
p = p->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
person *node = (person *) malloc(sizeof(person));
|
person *node = newPerson(name);
|
||||||
newPerson(node, name);
|
|
||||||
|
|
||||||
if (*head == NULL || strcmp((*head)->name, name) > 0) {
|
if (*head == NULL || strcmp((*head)->name, name) > 0) {
|
||||||
node->next = *head;
|
node->next = *head;
|
||||||
|
176
surnames.py
176
surnames.py
@@ -1,64 +1,162 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Fast DBLP Surname Extractor - Optimized for 4GB+ files
|
DBLP Surname Extractor
|
||||||
|
Replicates C program logic for surname frequency analysis
|
||||||
|
Usage: gunzip -c dblp.xml.gz | python surnames.py
|
||||||
|
python surnames.py dblp.xml
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sys
|
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict
|
import sys
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
def fast_extract_surnames(input_stream, min_count=10000):
|
def extract_surname(name_text):
|
||||||
"""Memory-efficient surname extraction"""
|
"""Extract surname using same logic as C program"""
|
||||||
surname_counts = defaultdict(int)
|
if not name_text:
|
||||||
|
return None
|
||||||
|
|
||||||
# Compile regex for performance
|
# Split into words
|
||||||
tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
|
words = name_text.strip().split()
|
||||||
year_pattern = re.compile(r"\s+\d{4}$")
|
if not words:
|
||||||
entity_pattern = re.compile(r"&[^;]*;")
|
return None
|
||||||
|
|
||||||
|
# Remove 4-digit year if present at end
|
||||||
|
if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
|
||||||
|
words = words[:-1]
|
||||||
|
|
||||||
|
if not words:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Return last word as surname
|
||||||
|
surname = words[-1]
|
||||||
|
|
||||||
|
# Clean up XML entities (basic cleanup)
|
||||||
|
surname = re.sub(r"&[^;]*;", "", surname)
|
||||||
|
|
||||||
|
return surname if surname else None
|
||||||
|
|
||||||
|
|
||||||
|
def process_xml_stream(input_stream, min_count=10000):
|
||||||
|
"""Process XML from stream (for piping)"""
|
||||||
|
surname_counter = Counter()
|
||||||
|
|
||||||
|
# Read and process line by line for memory efficiency
|
||||||
|
current_element = ""
|
||||||
|
in_author_or_editor = False
|
||||||
|
tag_name = ""
|
||||||
|
|
||||||
for line in input_stream:
|
for line in input_stream:
|
||||||
for match in tag_pattern.finditer(line):
|
line = line.strip()
|
||||||
content = match.group(2).strip()
|
|
||||||
|
|
||||||
# Remove year suffix
|
# Check for author or editor tags
|
||||||
content = year_pattern.sub("", content)
|
author_match = re.search(
|
||||||
|
r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
|
||||||
|
)
|
||||||
|
if author_match:
|
||||||
|
tag_name = author_match.group(1)
|
||||||
|
content = author_match.group(2)
|
||||||
|
surname = extract_surname(content)
|
||||||
|
if surname:
|
||||||
|
surname_counter[surname] += 1
|
||||||
|
else:
|
||||||
|
# Handle multi-line tags
|
||||||
|
if re.search(r"<(author|editor)", line):
|
||||||
|
in_author_or_editor = True
|
||||||
|
tag_match = re.search(r"<(author|editor)", line)
|
||||||
|
tag_name = tag_match.group(1)
|
||||||
|
current_element = line
|
||||||
|
elif in_author_or_editor:
|
||||||
|
current_element += " " + line
|
||||||
|
if f"</{tag_name}>" in line:
|
||||||
|
# Extract content between tags
|
||||||
|
content_match = re.search(
|
||||||
|
rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
|
||||||
|
)
|
||||||
|
if content_match:
|
||||||
|
content = content_match.group(1)
|
||||||
|
surname = extract_surname(content)
|
||||||
|
if surname:
|
||||||
|
surname_counter[surname] += 1
|
||||||
|
in_author_or_editor = False
|
||||||
|
current_element = ""
|
||||||
|
|
||||||
# Get surname (last word)
|
# Return surnames above threshold, sorted by count
|
||||||
words = content.split()
|
return [
|
||||||
if words:
|
|
||||||
surname = words[-1]
|
|
||||||
# Clean entities
|
|
||||||
surname = entity_pattern.sub("", surname)
|
|
||||||
if surname:
|
|
||||||
surname_counts[surname] += 1
|
|
||||||
|
|
||||||
# Filter and sort results
|
|
||||||
results = [
|
|
||||||
(surname, count)
|
(surname, count)
|
||||||
for surname, count in surname_counts.items()
|
for surname, count in surname_counter.most_common()
|
||||||
if count >= min_count
|
if count >= min_count
|
||||||
]
|
]
|
||||||
results.sort(key=lambda x: x[1], reverse=True)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
def process_xml_file(filename, min_count=10000):
|
||||||
|
"""Process XML file using ElementTree (more robust)"""
|
||||||
|
surname_counter = Counter()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Parse XML incrementally for memory efficiency
|
||||||
|
context = ET.iterparse(filename, events=("start", "end"))
|
||||||
|
context = iter(context)
|
||||||
|
event, root = next(context)
|
||||||
|
|
||||||
|
for event, elem in context:
|
||||||
|
if event == "end" and elem.tag in ["author", "editor"]:
|
||||||
|
if elem.text:
|
||||||
|
surname = extract_surname(elem.text)
|
||||||
|
if surname:
|
||||||
|
surname_counter[surname] += 1
|
||||||
|
elem.clear() # Free memory
|
||||||
|
|
||||||
|
except ET.ParseError:
|
||||||
|
# Fallback to line-by-line processing
|
||||||
|
print(
|
||||||
|
"XML parsing failed, falling back to regex processing...", file=sys.stderr
|
||||||
|
)
|
||||||
|
with open(filename, "r", encoding="utf-8") as f:
|
||||||
|
return process_xml_stream(f, min_count)
|
||||||
|
|
||||||
|
return [
|
||||||
|
(surname, count)
|
||||||
|
for surname, count in surname_counter.most_common()
|
||||||
|
if count >= min_count
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
min_count = (
|
min_count = 10000
|
||||||
int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000
|
|
||||||
)
|
|
||||||
|
|
||||||
if len(sys.argv) > 1 and not sys.argv[1].isdigit():
|
# Handle command line arguments
|
||||||
# Read from file
|
if len(sys.argv) > 1:
|
||||||
with open(sys.argv[1], "r", encoding="utf-8") as f:
|
if sys.argv[1] in ["-h", "--help"]:
|
||||||
results = fast_extract_surnames(f, min_count)
|
print(__doc__)
|
||||||
|
sys.exit(0)
|
||||||
|
elif sys.argv[1].isdigit():
|
||||||
|
min_count = int(sys.argv[1])
|
||||||
|
filename = sys.argv[2] if len(sys.argv) > 2 else None
|
||||||
|
else:
|
||||||
|
filename = sys.argv[1]
|
||||||
|
min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
|
||||||
else:
|
else:
|
||||||
# Read from stdin
|
filename = None
|
||||||
results = fast_extract_surnames(sys.stdin, min_count)
|
|
||||||
|
|
||||||
for surname, count in results:
|
try:
|
||||||
print(f"{surname} {count}")
|
if filename:
|
||||||
|
# Read from file
|
||||||
|
results = process_xml_file(filename, min_count)
|
||||||
|
else:
|
||||||
|
# Read from stdin (piped input)
|
||||||
|
results = process_xml_stream(sys.stdin, min_count)
|
||||||
|
|
||||||
|
# Output results in same format as C program
|
||||||
|
for surname, count in results:
|
||||||
|
print(f"{surname} {count}")
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error: {e}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@@ -1,176 +1,45 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# surnames_validation.sh
|
# simple_surnames_validation.sh - Robust and simple
|
||||||
# Comprehensive validation script for DBLP surname extraction
|
|
||||||
|
|
||||||
set -e # Exit on any error
|
|
||||||
|
|
||||||
DBLP_FILE="${1:-dblp.xml.gz}"
|
DBLP_FILE="${1:-dblp.xml.gz}"
|
||||||
MIN_COUNT="${2:-10000}"
|
TEMP_DIR="/tmp/surnames_val_$$"
|
||||||
TEMP_DIR="/tmp/surnames_validation_$$"
|
|
||||||
|
|
||||||
# Colors for output
|
echo "=== Simple Surname Validation ==="
|
||||||
RED='\033[0;31m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
YELLOW='\033[1;33m'
|
|
||||||
BLUE='\033[0;34m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
echo -e "${BLUE}=== DBLP Surname Extraction Validation ===${NC}"
|
|
||||||
echo "Input: $DBLP_FILE"
|
echo "Input: $DBLP_FILE"
|
||||||
echo "Minimum count: $MIN_COUNT"
|
|
||||||
echo "Temp directory: $TEMP_DIR"
|
|
||||||
|
|
||||||
# Create temp directory
|
|
||||||
mkdir -p "$TEMP_DIR"
|
mkdir -p "$TEMP_DIR"
|
||||||
|
|
||||||
# Function to cleanup
|
# Sequential execution to avoid pipe issues
|
||||||
cleanup() {
|
echo "Running C program..."
|
||||||
echo -e "\n${YELLOW}Cleaning up...${NC}"
|
gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
|
||||||
rm -rf "$TEMP_DIR"
|
c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
|
||||||
}
|
c_wang=$(head -1 "$TEMP_DIR/c_results.txt")
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
# Check if files exist
|
echo "Running Python validation..."
|
||||||
if [[ ! -f "surnames" ]]; then
|
gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
|
||||||
echo -e "${RED}Error: C program 'surnames' not found!${NC}"
|
python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
|
||||||
exit 1
|
python_wang=$(head -1 "$TEMP_DIR/python_results.txt")
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -f "surnames.py" ]]; then
|
echo "Results:"
|
||||||
echo -e "${RED}Error: Python script 'surnames.py' not found!${NC}"
|
echo " C program: $c_count surnames"
|
||||||
exit 1
|
echo " Python script: $python_count surnames"
|
||||||
fi
|
echo " C top result: $c_wang"
|
||||||
|
echo " Python top result: $python_wang"
|
||||||
|
|
||||||
if [[ ! -f "$DBLP_FILE" ]]; then
|
if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
|
||||||
echo -e "${RED}Error: DBLP file '$DBLP_FILE' not found!${NC}"
|
echo "✓ Results are identical!"
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo -e "\n${BLUE}=== Memory Leak Testing (Valgrind) ===${NC}"
|
|
||||||
echo "Running valgrind on C program..."
|
|
||||||
if command -v valgrind &> /dev/null; then
|
|
||||||
gunzip -c "$DBLP_FILE" | valgrind --leak-check=full --show-leak-kinds=all \
|
|
||||||
--track-origins=yes --verbose --log-file="$TEMP_DIR/valgrind.log" \
|
|
||||||
./surnames > "$TEMP_DIR/c_results_valgrind.txt" 2>&1
|
|
||||||
|
|
||||||
echo "Valgrind results:"
|
|
||||||
if grep -q "All heap blocks were freed" "$TEMP_DIR/valgrind.log"; then
|
|
||||||
echo -e "${GREEN}✓ No memory leaks detected${NC}"
|
|
||||||
else
|
|
||||||
echo -e "${RED}⚠ Potential memory issues detected${NC}"
|
|
||||||
grep -E "(definitely lost|possibly lost|still reachable)" "$TEMP_DIR/valgrind.log" | head -5
|
|
||||||
fi
|
|
||||||
echo "Full valgrind log: $TEMP_DIR/valgrind.log"
|
|
||||||
else
|
else
|
||||||
echo -e "${YELLOW}Valgrind not available, skipping memory check${NC}"
|
echo "⚠ Results differ - checking details..."
|
||||||
|
diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo -e "\n${BLUE}=== Performance Profiling ===${NC}"
|
echo "Memory check..."
|
||||||
|
echo "test" | valgrind --leak-check=yes ./surnames > /dev/null 2>&1
|
||||||
# C program performance
|
if [ $? -eq 0 ]; then
|
||||||
echo "Profiling C program..."
|
echo "✓ No memory leaks detected"
|
||||||
if command -v perf &> /dev/null; then
|
|
||||||
echo "Using perf for detailed profiling..."
|
|
||||||
gunzip -c "$DBLP_FILE" | perf stat -d ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_perf.log"
|
|
||||||
echo "Perf results:"
|
|
||||||
grep -E "(task-clock|cycles|instructions|cache-misses)" "$TEMP_DIR/c_perf.log" | head -4
|
|
||||||
else
|
|
||||||
echo "Using time for basic profiling..."
|
|
||||||
gunzip -c "$DBLP_FILE" | /usr/bin/time -v ./surnames > "$TEMP_DIR/c_results.txt" 2> "$TEMP_DIR/c_time.log"
|
|
||||||
echo "Time results:"
|
|
||||||
grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/c_time.log"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Python program performance
|
# Cleanup
|
||||||
echo -e "\nProfiling Python program..."
|
rm -rf "$TEMP_DIR"
|
||||||
gunzip -c "$DBLP_FILE" | /usr/bin/time -v python3 surnames.py > "$TEMP_DIR/python_results.txt" 2> "$TEMP_DIR/python_time.log"
|
echo "Validation complete!"
|
||||||
echo "Python time results:"
|
|
||||||
grep -E "(Elapsed|Maximum resident|Page faults)" "$TEMP_DIR/python_time.log"
|
|
||||||
|
|
||||||
echo -e "\n${BLUE}=== Output Validation ===${NC}"
|
|
||||||
|
|
||||||
# Compare line counts
|
|
||||||
c_lines=$(wc -l < "$TEMP_DIR/c_results.txt")
|
|
||||||
python_lines=$(wc -l < "$TEMP_DIR/python_results.txt")
|
|
||||||
|
|
||||||
echo "Result counts:"
|
|
||||||
echo " C program: $c_lines surnames"
|
|
||||||
echo " Python script: $python_lines surnames"
|
|
||||||
|
|
||||||
if [[ $c_lines -eq $python_lines ]]; then
|
|
||||||
echo -e "${GREEN}✓ Line counts match${NC}"
|
|
||||||
else
|
|
||||||
echo -e "${YELLOW}⚠ Line counts differ${NC}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Compare top 10 results
|
|
||||||
echo -e "\nTop 10 comparison:"
|
|
||||||
echo -e "${BLUE}C Program:${NC}"
|
|
||||||
head -10 "$TEMP_DIR/c_results.txt"
|
|
||||||
echo -e "${BLUE}Python Script:${NC}"
|
|
||||||
head -10 "$TEMP_DIR/python_results.txt"
|
|
||||||
|
|
||||||
# Detailed difference analysis
|
|
||||||
echo -e "\n${BLUE}=== Detailed Difference Analysis ===${NC}"
|
|
||||||
if command -v diff &> /dev/null; then
|
|
||||||
echo "Running diff analysis..."
|
|
||||||
if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
|
|
||||||
echo -e "${GREEN}✓ Results are identical!${NC}"
|
|
||||||
else
|
|
||||||
echo -e "${YELLOW}⚠ Found differences:${NC}"
|
|
||||||
diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -20
|
|
||||||
|
|
||||||
# Count differences
|
|
||||||
diff_count=$(diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | wc -l)
|
|
||||||
echo "Total difference lines: $diff_count"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "diff not available, using comm..."
|
|
||||||
comm -3 <(sort "$TEMP_DIR/c_results.txt") <(sort "$TEMP_DIR/python_results.txt") | head -10
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Specific validation checks
|
|
||||||
echo -e "\n${BLUE}=== Specific Validation Checks ===${NC}"
|
|
||||||
|
|
||||||
# Check Wang count (most frequent)
|
|
||||||
c_wang=$(grep "^Wang " "$TEMP_DIR/c_results.txt" | awk '{print $2}')
|
|
||||||
python_wang=$(grep "^Wang " "$TEMP_DIR/python_results.txt" | awk '{print $2}')
|
|
||||||
|
|
||||||
echo "Wang frequency check:"
|
|
||||||
echo " C: $c_wang"
|
|
||||||
echo " Python: $python_wang"
|
|
||||||
if [[ "$c_wang" == "$python_wang" ]]; then
|
|
||||||
echo -e "${GREEN}✓ Wang counts match${NC}"
|
|
||||||
else
|
|
||||||
echo -e "${RED}✗ Wang counts differ${NC}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check for duplicates in C results
|
|
||||||
echo -e "\nDuplicate check (C program):"
|
|
||||||
duplicates=$(awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | wc -l)
|
|
||||||
if [[ $duplicates -eq 0 ]]; then
|
|
||||||
echo -e "${GREEN}✓ No duplicates found${NC}"
|
|
||||||
else
|
|
||||||
echo -e "${RED}✗ Found $duplicates duplicate surnames${NC}"
|
|
||||||
awk '{print $1}' "$TEMP_DIR/c_results.txt" | sort | uniq -d | head -5
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Performance comparison
|
|
||||||
echo -e "\n${BLUE}=== Performance Summary ===${NC}"
|
|
||||||
c_time=$(grep "Elapsed" "$TEMP_DIR/c_time.log" | awk '{print $8}' | head -1)
|
|
||||||
python_time=$(grep "Elapsed" "$TEMP_DIR/python_time.log" | awk '{print $8}' | head -1)
|
|
||||||
|
|
||||||
echo "Execution times:"
|
|
||||||
echo " C program: $c_time"
|
|
||||||
echo " Python script: $python_time"
|
|
||||||
|
|
||||||
# Save results for later analysis
|
|
||||||
echo -e "\n${BLUE}=== Results Saved ===${NC}"
|
|
||||||
echo "Results saved in: $TEMP_DIR"
|
|
||||||
echo " C results: $TEMP_DIR/c_results.txt"
|
|
||||||
echo " Python results: $TEMP_DIR/python_results.txt"
|
|
||||||
echo " Valgrind log: $TEMP_DIR/valgrind.log"
|
|
||||||
echo " Performance logs: $TEMP_DIR/*_time.log"
|
|
||||||
|
|
||||||
echo -e "\n${GREEN}Validation complete!${NC}"
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user