deleted bash script for now
This commit is contained in:
@@ -8,6 +8,10 @@
|
|||||||
#define HASH_BUCKETS 4000037
|
#define HASH_BUCKETS 4000037
|
||||||
#define MIN_COUNT 10000
|
#define MIN_COUNT 10000
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Usage: pipe dblp.xml to the program or have it in the same folder as the program
|
||||||
|
*/
|
||||||
|
|
||||||
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while (i < max_len - 1 && src[i]) {
|
while (i < max_len - 1 && src[i]) {
|
||||||
@@ -170,30 +174,6 @@ void make_list(person **hashmap, person **list, const int min_count) {
|
|||||||
free(hashmap);
|
free(hashmap);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Report a duplicated name: walk the list via ->next and count how many
 * entries carry the same name as `test`.
 *
 * list: first entry of the list to scan (NULL means an empty list).
 * test: entry whose name is searched for; it is dereferenced, so it must
 *       not be NULL.
 *
 * Prints "ERROR: <name>" to stdout when the name occurs more than once.
 */
void verify_list(person *list, person *test) {
    int matches = 0;

    for (person *node = list; node != NULL; node = node->next) {
        if (strcmp(node->name, test->name) == 0) {
            matches++;
        }
    }

    if (matches > 1) {
        printf("ERROR: %s\n", test->name);
    }
}
|
|
||||||
|
|
||||||
/*
 * Run verify_list() once for every entry of the list, so any name that
 * appears more than once gets reported (once per occurrence).
 *
 * Each call to verify_list() scans the whole list again, so the total
 * work grows quadratically with the list length.
 */
void check_list(person *list) {
    for (person *node = list; node != NULL; node = node->next) {
        verify_list(list, node);
    }
}
|
|
||||||
|
|
||||||
void clean_list(person *list) {
|
void clean_list(person *list) {
|
||||||
person *p = list;
|
person *p = list;
|
||||||
while (p != NULL) {
|
while (p != NULL) {
|
||||||
@@ -246,7 +226,6 @@ int main(void) {
|
|||||||
free(buffer);
|
free(buffer);
|
||||||
person *list = NULL;
|
person *list = NULL;
|
||||||
make_list(hashmap, &list, MIN_COUNT);
|
make_list(hashmap, &list, MIN_COUNT);
|
||||||
check_list(list);
|
|
||||||
display(list);
|
display(list);
|
||||||
clean_list(list);
|
clean_list(list);
|
||||||
return 0;
|
return 0;
|
65
surnames_fast.py
Normal file
65
surnames_fast.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fast DBLP Surname Extractor - Optimized for 4GB+ files
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
def fast_extract_surnames(input_stream, min_count=10000):
    """Count author/editor surnames and return the frequent ones.

    The input is scanned line by line, so it is never held in memory as a
    whole. For every ``<author>``/``<editor>`` element found on a line, a
    trailing four-digit suffix is dropped, the last whitespace-separated
    word is taken as the surname, and any ``&...;`` entities are removed
    from it.

    Args:
        input_stream: Iterable of text lines (an open text file, stdin, or
            a list of strings).
        min_count: Minimum number of occurrences a surname needs in order
            to be included in the result.

    Returns:
        A list of ``(surname, count)`` tuples with ``count >= min_count``,
        sorted by count in descending order.
    """
    surname_counts = defaultdict(int)

    # Compile once: these patterns are applied to every input line.
    # \b keeps look-alike tag names (e.g. <authors>) from matching, and the
    # \1 backreference forces the closing tag to match the opening tag.
    tag_pattern = re.compile(r"<(author|editor)\b[^>]*>([^<]+)</\1>")
    year_pattern = re.compile(r"\s+\d{4}$")
    entity_pattern = re.compile(r"&[^;]*;")

    for line in input_stream:
        for match in tag_pattern.finditer(line):
            # Strip a trailing four-digit suffix such as "Wei Wang 0001".
            content = year_pattern.sub("", match.group(2).strip())

            words = content.split()
            if not words:
                continue

            # The surname is the last word; entities are removed entirely.
            surname = entity_pattern.sub("", words[-1])
            if surname:
                surname_counts[surname] += 1

    # Keep only surnames that reach the threshold, most frequent first.
    results = [
        (surname, count)
        for surname, count in surname_counts.items()
        if count >= min_count
    ]
    results.sort(key=lambda item: item[1], reverse=True)

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: print frequent surnames as "<surname> <count>".

    Supported invocations:
        script.py               read stdin, threshold 10000
        script.py 5000          read stdin, threshold 5000
        script.py FILE          read FILE, threshold 10000
        script.py FILE 5000     read FILE, threshold 5000

    A first argument made up only of digits is always treated as the
    threshold, never as a file name.
    """
    args = sys.argv[1:]
    min_count = 10000
    path = None

    if args:
        if args[0].isdigit():
            min_count = int(args[0])
        else:
            path = args[0]
            # Optional threshold after the file name.
            if len(args) > 1 and args[1].isdigit():
                min_count = int(args[1])

    if path is not None:
        # Read from file
        with open(path, "r", encoding="utf-8") as f:
            results = fast_extract_surnames(f, min_count)
    else:
        # Read from stdin
        results = fast_extract_surnames(sys.stdin, min_count)

    for surname, count in results:
        print(f"{surname} {count}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@@ -1,41 +0,0 @@
|
|||||||
#!/bin/bash
#
# Cross-check the C surname counter (./main) against a reference
# grep/sed/awk pipeline run over dblp.xml in the current directory.
#
# Writes the pipeline output to bash_results.txt and the C program output
# to c_results.txt, then prints the top entries of both plus the "Wang"
# line from each so they can be compared by eye.

# Stop on the first failing command, unset variable, or failing pipeline
# stage instead of reporting "complete" on partial results.
set -euo pipefail

readonly INPUT_FILE="dblp.xml"
readonly MIN_COUNT=10000
readonly BASH_RESULTS="bash_results.txt"
readonly C_RESULTS="c_results.txt"

if [[ ! -r "$INPUT_FILE" ]]; then
    echo "Error: cannot read $INPUT_FILE" >&2
    exit 1
fi

if [[ ! -x ./main ]]; then
    echo "Error: ./main is missing or not executable" >&2
    exit 1
fi

echo "Running full validation with working pipeline..."

# Generate complete bash results using the proven pipeline
echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
{
    grep -E "<(author|editor)" "$INPUT_FILE" | \
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
    awk '{
        # Drop a trailing four-digit field before taking the last word.
        if ($NF ~ /^[0-9]{4}$/) { NF-- }
        if (NF > 0) {
            surname = $NF
            gsub(/&[^;]*;/, "", surname)
            if (length(surname) > 0) print surname
        }
    }' | \
    sort | uniq -c | \
    awk -v min="$MIN_COUNT" '$1 >= min {print $2, $1}' | \
    sort -k2 -nr
} > "$BASH_RESULTS"

echo "Bash extraction complete. Results: $(wc -l < "$BASH_RESULTS") surnames"

# Get your C program results
./main > "$C_RESULTS"

echo "C extraction complete. Results: $(wc -l < "$C_RESULTS") surnames"

# Quick comparison of top entries
echo "Top 5 comparison:"
echo "=== C Program ==="
head -5 "$C_RESULTS"
echo "=== Bash Script ==="
head -5 "$BASH_RESULTS"

# Check Wang specifically
echo "Wang comparison:"
echo "C: $(grep "^Wang " "$C_RESULTS")"
echo "Bash: $(grep "^Wang " "$BASH_RESULTS")"
|
||||||
|
|
Reference in New Issue
Block a user