Compare commits

...

4 Commits

SHA1  Message  Date
8e9cd18fa6 some small fixes 2025-06-13 05:51:53 +02:00
aa6cde55f4 update gitignore 2025-06-13 05:05:52 +02:00
d4d3dca574 added new bash validation scripts 2025-06-13 05:05:33 +02:00
b669c7135a added py script for validation 2025-06-13 04:58:34 +02:00
6 changed files with 253 additions and 69 deletions

.gitignore (2 additions)

@@ -140,3 +140,5 @@ dkms.conf
 /bash_results.txt
 /c_results.txt
 /surnames
+/c_quick.txt
+/python_quick.txt

(C program source file)

@@ -9,7 +9,7 @@
 #define MIN_COUNT 10000
 
 /*
- * Usage: pipe dblp.xml to the programm or have it in the same folder as the program
+ * Usage: pipe dblp.xml to the program or have it in the same folder as the program
  */
 
 void string_ncopy(char *dest, const char *src, size_t max_len) {
@@ -27,10 +27,12 @@ typedef struct person {
     int count;
 } person;
 
-void newPerson(person *p, const char *name) {
+person* newPerson(const char *name) {
+    person *p = (person *) malloc(sizeof(person));
     string_ncopy(p->name, name, BUFFER_LENGTH);
     p->count = 1;
     p->next = NULL;
+    return p;
 }
 
 void sorted_name_insert(person **head, char *name) {
@@ -43,8 +45,7 @@ void sorted_name_insert(person **head, char *name) {
         p = p->next;
     }
-    person *node = (person *) malloc(sizeof(person));
-    newPerson(node, name);
+    person *node = newPerson(name);
     if (*head == NULL || strcmp((*head)->name, name) > 0) {
         node->next = *head;

surnames.py (new file, 163 lines)

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
DBLP Surname Extractor
Replicates C program logic for surname frequency analysis
Usage: gunzip -c dblp.xml.gz | python surnames.py
       python surnames.py dblp.xml
"""
import re
import sys
import xml.etree.ElementTree as ET
from collections import Counter


def extract_surname(name_text):
    """Extract surname using same logic as C program"""
    if not name_text:
        return None
    # Split into words
    words = name_text.strip().split()
    if not words:
        return None
    # Remove 4-digit year if present at end
    if len(words) > 1 and re.match(r"^\d{4}$", words[-1]):
        words = words[:-1]
        if not words:
            return None
    # Return last word as surname
    surname = words[-1]
    # Clean up XML entities (basic cleanup)
    surname = re.sub(r"&[^;]*;", "", surname)
    return surname if surname else None


def process_xml_stream(input_stream, min_count=10000):
    """Process XML from stream (for piping)"""
    surname_counter = Counter()
    # Read and process line by line for memory efficiency
    current_element = ""
    in_author_or_editor = False
    tag_name = ""
    for line in input_stream:
        line = line.strip()
        # Check for author or editor tags
        author_match = re.search(
            r"<(author|editor)[^>]*>([^<]+)</(author|editor)>", line
        )
        if author_match:
            tag_name = author_match.group(1)
            content = author_match.group(2)
            surname = extract_surname(content)
            if surname:
                surname_counter[surname] += 1
        else:
            # Handle multi-line tags
            if re.search(r"<(author|editor)", line):
                in_author_or_editor = True
                tag_match = re.search(r"<(author|editor)", line)
                tag_name = tag_match.group(1)
                current_element = line
            elif in_author_or_editor:
                current_element += " " + line
                if f"</{tag_name}>" in line:
                    # Extract content between tags
                    content_match = re.search(
                        rf"<{tag_name}[^>]*>([^<]+)</{tag_name}>", current_element
                    )
                    if content_match:
                        content = content_match.group(1)
                        surname = extract_surname(content)
                        if surname:
                            surname_counter[surname] += 1
                    in_author_or_editor = False
                    current_element = ""
    # Return surnames above threshold, sorted by count
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]


def process_xml_file(filename, min_count=10000):
    """Process XML file using ElementTree (more robust)"""
    surname_counter = Counter()
    try:
        # Parse XML incrementally for memory efficiency
        context = ET.iterparse(filename, events=("start", "end"))
        context = iter(context)
        event, root = next(context)
        for event, elem in context:
            if event == "end" and elem.tag in ["author", "editor"]:
                if elem.text:
                    surname = extract_surname(elem.text)
                    if surname:
                        surname_counter[surname] += 1
                elem.clear()  # Free memory
    except ET.ParseError:
        # Fallback to line-by-line processing
        print(
            "XML parsing failed, falling back to regex processing...", file=sys.stderr
        )
        with open(filename, "r", encoding="utf-8") as f:
            return process_xml_stream(f, min_count)
    return [
        (surname, count)
        for surname, count in surname_counter.most_common()
        if count >= min_count
    ]


def main():
    min_count = 10000
    # Handle command line arguments
    if len(sys.argv) > 1:
        if sys.argv[1] in ["-h", "--help"]:
            print(__doc__)
            sys.exit(0)
        elif sys.argv[1].isdigit():
            min_count = int(sys.argv[1])
            filename = sys.argv[2] if len(sys.argv) > 2 else None
        else:
            filename = sys.argv[1]
            min_count = int(sys.argv[2]) if len(sys.argv) > 2 else 10000
    else:
        filename = None
    try:
        if filename:
            # Read from file
            results = process_xml_file(filename, min_count)
        else:
            # Read from stdin (piped input)
            results = process_xml_stream(sys.stdin, min_count)
        # Output results in same format as C program
        for surname, count in results:
            print(f"{surname} {count}")
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

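For a quick sanity check of surnames.py without the full DBLP dump, a snippet along these lines should do. This is a minimal sketch: the sample records are made up, and it assumes surnames.py can be imported as a module from the working directory.

# Hypothetical smoke test for surnames.py with made-up sample records.
import io

from surnames import process_xml_stream

# Three tiny DBLP-style lines; the trailing 4-digit token "0002" is stripped
# by extract_surname, so the counted surnames are Wang (2) and Chen (1).
sample = io.StringIO(
    "<article><author>Wei Wang</author></article>\n"
    "<article><author>Li Chen 0002</author></article>\n"
    "<inproceedings><editor>Wei Wang</editor></inproceedings>\n"
)

for surname, count in process_xml_stream(sample, min_count=1):
    print(f"{surname} {count}")  # expected: "Wang 2" then "Chen 1"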
(removed Python script)

@@ -1,65 +0,0 @@
#!/usr/bin/env python3
"""
Fast DBLP Surname Extractor - Optimized for 4GB+ files
"""
import sys
import re
from collections import defaultdict


def fast_extract_surnames(input_stream, min_count=10000):
    """Memory-efficient surname extraction"""
    surname_counts = defaultdict(int)
    # Compile regex for performance
    tag_pattern = re.compile(r"<(author|editor)[^>]*>([^<]+)</(author|editor)>")
    year_pattern = re.compile(r"\s+\d{4}$")
    entity_pattern = re.compile(r"&[^;]*;")
    for line in input_stream:
        for match in tag_pattern.finditer(line):
            content = match.group(2).strip()
            # Remove year suffix
            content = year_pattern.sub("", content)
            # Get surname (last word)
            words = content.split()
            if words:
                surname = words[-1]
                # Clean entities
                surname = entity_pattern.sub("", surname)
                if surname:
                    surname_counts[surname] += 1
    # Filter and sort results
    results = [
        (surname, count)
        for surname, count in surname_counts.items()
        if count >= min_count
    ]
    results.sort(key=lambda x: x[1], reverse=True)
    return results


def main():
    min_count = (
        int(sys.argv[1]) if len(sys.argv) > 1 and sys.argv[1].isdigit() else 10000
    )
    if len(sys.argv) > 1 and not sys.argv[1].isdigit():
        # Read from file
        with open(sys.argv[1], "r", encoding="utf-8") as f:
            results = fast_extract_surnames(f, min_count)
    else:
        # Read from stdin
        results = fast_extract_surnames(sys.stdin, min_count)
    for surname, count in results:
        print(f"{surname} {count}")


if __name__ == "__main__":
    main()

quick_validation.sh (new file, 38 lines)

@@ -0,0 +1,38 @@
#!/bin/bash
# quick_validation.sh - Simplified version for rapid testing

echo "=== Quick Surname Validation ==="

# Quick output comparison
echo "Generating results..."
gunzip -c dblp.xml.gz | ./surnames > c_quick.txt &
C_PID=$!
gunzip -c dblp.xml.gz | python3 surnames.py > python_quick.txt &
PYTHON_PID=$!
wait $C_PID $PYTHON_PID

echo "C results: $(wc -l < c_quick.txt) surnames"
echo "Python results: $(wc -l < python_quick.txt) surnames"

echo "Wang comparison:"
echo "C: $(grep "^Wang " c_quick.txt)"
echo "Python: $(grep "^Wang " python_quick.txt)"

if diff -q c_quick.txt python_quick.txt > /dev/null; then
    echo "✓ Results identical!"
else
    echo "⚠ Results differ"
fi

# Quick memory check
echo "Memory check:"
gunzip -c dblp.xml.gz | valgrind --leak-check=yes ./surnames > /dev/null 2>&1
if [ $? -eq 0 ]; then
    echo "✓ No major memory issues"
else
    echo "⚠ Check valgrind output"
fi

rm -f c_quick.txt python_quick.txt

surnames_validation.sh (new file, 45 lines)

@@ -0,0 +1,45 @@
#!/bin/bash
# simple_surnames_validation.sh - Robust and simple

DBLP_FILE="${1:-dblp.xml.gz}"
TEMP_DIR="/tmp/surnames_val_$$"

echo "=== Simple Surname Validation ==="
echo "Input: $DBLP_FILE"
mkdir -p "$TEMP_DIR"

# Sequential execution to avoid pipe issues
echo "Running C program..."
gunzip -c "$DBLP_FILE" | ./surnames > "$TEMP_DIR/c_results.txt"
c_count=$(wc -l < "$TEMP_DIR/c_results.txt")
c_wang=$(head -1 "$TEMP_DIR/c_results.txt")

echo "Running Python validation..."
gunzip -c "$DBLP_FILE" | python3 surnames.py > "$TEMP_DIR/python_results.txt"
python_count=$(wc -l < "$TEMP_DIR/python_results.txt")
python_wang=$(head -1 "$TEMP_DIR/python_results.txt")

echo "Results:"
echo " C program: $c_count surnames"
echo " Python script: $python_count surnames"
echo " C top result: $c_wang"
echo " Python top result: $python_wang"

if diff -q "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" > /dev/null; then
    echo "✓ Results are identical!"
else
    echo "⚠ Results differ - checking details..."
    diff "$TEMP_DIR/c_results.txt" "$TEMP_DIR/python_results.txt" | head -10
fi

echo "Memory check..."
echo "test" | valgrind --leak-check=yes ./surnames > /dev/null 2>&1
if [ $? -eq 0 ]; then
    echo "✓ No memory leaks detected"
fi

# Cleanup
rm -rf "$TEMP_DIR"
echo "Validation complete!"
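Both programs print one "surname count" pair per line, which is why a plain diff is enough in the scripts above. If a mismatch ever needs closer inspection, a small helper roughly like the following could list only the disagreeing entries. This is a sketch: it assumes the quick-validation outputs c_quick.txt and python_quick.txt were kept rather than deleted at the end of quick_validation.sh.

# Hypothetical comparison helper for two "surname count" result files.
import sys


def load_counts(path):
    """Read a result file into a {surname: count} dict."""
    counts = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                counts[parts[0]] = int(parts[1])
    return counts


def main():
    # Default file names match the quick_validation.sh outputs (assumption).
    c_counts = load_counts(sys.argv[1] if len(sys.argv) > 1 else "c_quick.txt")
    py_counts = load_counts(sys.argv[2] if len(sys.argv) > 2 else "python_quick.txt")
    for surname in sorted(set(c_counts) | set(py_counts)):
        if c_counts.get(surname) != py_counts.get(surname):
            print(f"{surname}: C={c_counts.get(surname)} Python={py_counts.get(surname)}")


if __name__ == "__main__":
    main()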