Compare commits
6 Commits
d9ba23af7b
...
main
Author | SHA1 | Date | |
---|---|---|---|
8e9cd18fa6 | |||
aa6cde55f4 | |||
d4d3dca574 | |||
b669c7135a | |||
d35c253afa | |||
8f65b8c142 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -139,3 +139,6 @@ dkms.conf
|
||||
|
||||
/bash_results.txt
|
||||
/c_results.txt
|
||||
/surnames
|
||||
/c_quick.txt
|
||||
/python_quick.txt
|
||||
|
@@ -8,6 +8,10 @@
|
||||
#define HASH_BUCKETS 4000037
|
||||
#define MIN_COUNT 10000
|
||||
|
||||
/*
|
||||
* Usage: pipe dblp.xml to the program or have it in the same folder as the program
|
||||
*/
|
||||
|
||||
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
||||
size_t i = 0;
|
||||
while (i < max_len - 1 && src[i]) {
|
||||
@@ -23,10 +27,12 @@ typedef struct person {
|
||||
int count;
|
||||
} person;
|
||||
|
||||
void newPerson(person *p, const char *name) {
|
||||
person* newPerson(const char *name) {
|
||||
person *p = (person *) malloc(sizeof(person));
|
||||
string_ncopy(p->name, name, BUFFER_LENGTH);
|
||||
p->count = 1;
|
||||
p->next = NULL;
|
||||
return p;
|
||||
}
|
||||
|
||||
void sorted_name_insert(person **head, char *name) {
|
||||
@@ -39,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
|
||||
p = p->next;
|
||||
}
|
||||
|
||||
person *node = (person *) malloc(sizeof(person));
|
||||
newPerson(node, name);
|
||||
person *node = newPerson(name);
|
||||
|
||||
if (*head == NULL || strcmp((*head)->name, name) > 0) {
|
||||
node->next = *head;
|
||||
@@ -170,30 +175,6 @@ void make_list(person **hashmap, person **list, const int min_count) {
|
||||
free(hashmap);
|
||||
}
|
||||
|
||||
/*
 * Debug helper: report a name that is stored more than once.
 *
 * Walks every node of `list`, counts the nodes whose name compares equal to
 * test->name, and prints "ERROR: <name>" to stdout when more than one node
 * matches. The list itself is not modified.
 */
void verify_list(person *list, person *test) {
    person *cur;
    int matches = 0;

    for (cur = list; cur != NULL; cur = cur->next) {
        if (strcmp(cur->name, test->name) == 0) {
            matches++;
        }
    }

    if (matches <= 1) {
        return;
    }
    printf("ERROR: %s\n", test->name);
}
|
||||
|
||||
/*
 * Debug helper: run verify_list() once for every node of `list`, using that
 * node as the name to look for. Each call scans the whole list again, so the
 * total work is quadratic in the list length.
 */
void check_list(person *list) {
    person *cur;

    for (cur = list; cur != NULL; cur = cur->next) {
        verify_list(list, cur);
    }
}
|
||||
|
||||
void clean_list(person *list) {
|
||||
person *p = list;
|
||||
while (p != NULL) {
|
||||
@@ -246,7 +227,6 @@ int main(void) {
|
||||
free(buffer);
|
||||
person *list = NULL;
|
||||
make_list(hashmap, &list, MIN_COUNT);
|
||||
check_list(list);
|
||||
display(list);
|
||||
clean_list(list);
|
||||
return 0;
|
163
surnames.py
Normal file
163
surnames.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DBLP Surname Extractor
|
||||
Replicates C program logic for surname frequency analysis
|
||||
Usage: gunzip -c dblp.xml.gz | python surnames.py
|
||||
python surnames.py dblp.xml
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def extract_surname(name_text):
    """Return the surname (last word) of an author/editor name, or None.

    The name is split on whitespace. When it has more than one word and the
    last word consists of exactly four ASCII digits (e.g. "Wei Wang 0001"),
    that numeric suffix is dropped first. Any ``&...;`` sequence left in the
    chosen word is removed; if nothing remains, None is returned.

    Args:
        name_text: Text content of an author/editor element (may be empty
            or None).

    Returns:
        The surname string, or None when no usable surname is present.
    """
    if not name_text:
        return None

    words = name_text.split()
    if not words:
        return None

    # ASCII digits only: \d would also match other Unicode decimal digits.
    # A single-word name is kept even if it is four digits long.
    if len(words) > 1 and re.fullmatch(r"[0-9]{4}", words[-1]):
        words.pop()

    # Strip XML entity references such as "&uuml;" (basic cleanup).
    surname = re.sub(r"&[^;]*;", "", words[-1])

    return surname or None
|
||||
|
||||
|
||||
def process_xml_stream(input_stream, min_count=10000):
    """Count surnames in <author>/<editor> elements read line by line.

    Every complete element found on a line is counted. An element whose
    opening and closing tags are on different lines is accumulated (lines
    joined with a single space) until its closing tag is seen.

    Args:
        input_stream: Iterable of text lines (e.g. sys.stdin or an open file).
        min_count: Minimum number of occurrences for a surname to be returned.

    Returns:
        List of (surname, count) tuples with count >= min_count, ordered from
        most to least frequent.
    """
    element_re = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
    opening_re = re.compile(r"<(author|editor)")

    surname_counter = Counter()

    def tally(content):
        surname = extract_surname(content)
        if surname:
            surname_counter[surname] += 1

    pending = None  # text of an element whose closing tag has not been seen
    pending_tag = ""

    for raw_line in input_stream:
        line = raw_line.strip()

        # Count all complete elements on the line, not only the first one.
        # The state of a pending multi-line element is left untouched so a
        # complete element cannot overwrite the tag name being waited for.
        complete = element_re.findall(line)
        if complete:
            for _open_tag, content, _close_tag in complete:
                tally(content)
            continue

        opening = opening_re.search(line)
        if opening:
            pending_tag = opening.group(1)
            pending = line
        elif pending is not None:
            pending += " " + line
        else:
            continue

        # Checked on the opening line too: an element that closes on its own
        # line but did not match above (e.g. nested markup in its content)
        # must not leave the accumulator open, or every following line would
        # be appended to it.
        if f"</{pending_tag}>" in line:
            content_match = re.search(
                rf"<{pending_tag}[^>]*>([^<]+)</{pending_tag}>", pending
            )
            if content_match:
                tally(content_match.group(1))
            pending = None

    # Surnames at or above the threshold, most frequent first.
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
|
||||
|
||||
|
||||
def process_xml_file(filename, min_count=10000):
    """Count surnames in an XML file using ElementTree's incremental parser.

    If the parser raises ET.ParseError, a notice is printed to stderr and the
    file is processed again from the start with process_xml_stream(), opened
    as UTF-8 text; counts gathered before the error are discarded.

    Args:
        filename: Path of the XML file to read.
        min_count: Minimum number of occurrences for a surname to be returned.

    Returns:
        List of (surname, count) tuples with count >= min_count, ordered from
        most to least frequent.
    """
    surname_counter = Counter()

    try:
        context = ET.iterparse(filename, events=("start", "end"))
        context = iter(context)
        _event, root = next(context)  # first event is the root's "start"
        depth = 1  # number of currently open elements, root included

        for event, elem in context:
            if event == "start":
                depth += 1
                continue

            if elem.tag in ("author", "editor") and elem.text:
                surname = extract_surname(elem.text)
                if surname:
                    surname_counter[surname] += 1

            depth -= 1
            if depth == 1:
                # A direct child of the root just ended. Detach it (and any
                # text around it) from the root; clearing only the leaf
                # elements would leave every finished record attached and
                # memory would grow with the file size.
                root.clear()

    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...", file=sys.stderr
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)

    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Accepted forms:
        surnames.py                     read stdin, threshold 10000
        surnames.py MIN_COUNT [FILE]    an all-digit first argument is the threshold
        surnames.py FILE [MIN_COUNT]
        surnames.py -h | --help         print the module docstring and exit 0

    Prints one "surname count" line per result to stdout. Any error,
    including a non-numeric MIN_COUNT, is reported on stderr and the process
    exits with status 1.
    """
    args = sys.argv[1:]

    if args and args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    try:
        min_count = 10000
        filename = None

        # Argument parsing lives inside the try block so that a bad
        # MIN_COUNT is reported like every other error, not as a traceback.
        if args:
            if args[0].isdigit():
                min_count = int(args[0])
                filename = args[1] if len(args) > 1 else None
            else:
                filename = args[0]
                if len(args) > 1:
                    min_count = int(args[1])

        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)

        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")

    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
38
surnames_quick_validation.sh
Normal file
38
surnames_quick_validation.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
# surnames_quick_validation.sh - Simplified version for rapid testing
#
# Runs ./surnames and surnames.py on dblp.xml.gz in parallel, compares their
# output, then runs ./surnames once under valgrind. All paths are relative to
# the current directory.

echo "=== Quick Surname Validation ==="

# Quick output comparison
echo "Generating results..."
gunzip -c dblp.xml.gz | ./surnames > c_quick.txt &
C_PID=$!
gunzip -c dblp.xml.gz | python3 surnames.py > python_quick.txt &
PYTHON_PID=$!

wait "$C_PID" "$PYTHON_PID"

echo "C results: $(wc -l < c_quick.txt) surnames"
echo "Python results: $(wc -l < python_quick.txt) surnames"

echo "Wang comparison:"
echo "C: $(grep "^Wang " c_quick.txt)"
echo "Python: $(grep "^Wang " python_quick.txt)"

if diff -q c_quick.txt python_quick.txt > /dev/null; then
    echo "✓ Results identical!"
else
    echo "⚠ Results differ"
fi

# Quick memory check
echo "Memory check:"
VALGRIND_LOG="valgrind_quick.log"
# Without --error-exitcode valgrind exits with the status of ./surnames
# itself, so reported errors and leaks would never make this check fail.
# The report is kept in a log file so there is something to inspect.
if gunzip -c dblp.xml.gz |
    valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2> "$VALGRIND_LOG"; then
    echo "✓ No major memory issues"
    rm -f "$VALGRIND_LOG"
else
    echo "⚠ Check valgrind output ($VALGRIND_LOG)"
fi

rm -f c_quick.txt python_quick.txt
|
||||
|
45
surnames_validation.sh
Normal file
45
surnames_validation.sh
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# surnames_validation.sh - Robust and simple
#
# Usage: surnames_validation.sh [DBLP_FILE]    (default: dblp.xml.gz)
#
# Runs ./surnames and surnames.py one after the other on the gzip-compressed
# input, compares their output, then runs ./surnames under valgrind on a tiny
# input.

DBLP_FILE="${1:-dblp.xml.gz}"

# mktemp creates a fresh private directory with an unpredictable name;
# the trap removes it on every exit path.
TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/surnames_val.XXXXXX")" || exit 1
trap 'rm -rf "$TEMP_DIR"' EXIT

echo "=== Simple Surname Validation ==="
echo "Input: $DBLP_FILE"

# Sequential execution to avoid pipe issues
echo "Running C program..."
gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
c_wang=$(head -1 "$TEMP_DIR/c_results.txt")

echo "Running Python validation..."
gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
python_wang=$(head -1 "$TEMP_DIR/python_results.txt")

echo "Results:"
echo " C program: $c_count surnames"
echo " Python script: $python_count surnames"
echo " C top result: $c_wang"
echo " Python top result: $python_wang"

if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
    echo "✓ Results are identical!"
else
    echo "⚠ Results differ - checking details..."
    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
fi

echo "Memory check..."
# Without --error-exitcode valgrind exits with the status of ./surnames
# itself, so reported errors and leaks would never be noticed here.
if echo "test" | valgrind --leak-check=yes --error-exitcode=1 ./surnames > /dev/null 2>&1; then
    echo "✓ No memory leaks detected"
else
    echo "⚠ Memory check failed (valgrind missing, or errors/leaks reported)"
fi

# Cleanup is done by the EXIT trap above.
echo "Validation complete!"
|
||||
|
@@ -1,41 +0,0 @@
|
||||
#!/bin/bash
# Cross-checks the C program against a grep/sed/awk pipeline.
# Reads dblp.xml from the current directory and runs ./main; writes
# bash_results.txt and c_results.txt.

echo "Running full validation with working pipeline..."

# Generate complete bash results using the proven pipeline:
#   1. keep lines that contain an <author> or <editor> opening tag
#   2. strip everything up to the opening tag and from the closing tag on
#   3. drop a trailing 4-digit field, take the last field, remove &...; entities
#   4. count identical surnames, keep those seen at least 10000 times
#   5. order by count, highest first
echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
grep -E "<(author|editor)" dblp.xml |
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' |
    awk '{
    if ($NF ~ /^[0-9]{4}$/) { NF-- }
    if (NF > 0) {
        surname = $NF
        gsub(/&[^;]*;/, "", surname)
        if (length(surname) > 0) print surname
    }
}' |
    sort |
    uniq -c |
    awk '$1 >= 10000 {print $2, $1}' |
    sort -k2 -nr > bash_results.txt

echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"

# Get your C program results
./main > c_results.txt

echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"

# Quick comparison of top entries
echo "Top 5 comparison:"
echo "=== C Program ==="
head -5 c_results.txt
echo "=== Bash Script ==="
head -5 bash_results.txt

# Check Wang specifically
echo "Wang comparison:"
echo "C: $(grep "^Wang " c_results.txt)"
echo "Bash: $(grep "^Wang " bash_results.txt)"
|
||||
|
Reference in New Issue
Block a user