Reviewed revision of the surname-extraction patch.

Line structure and indentation were reconstructed (4-space indent assumed), so
apply with `git apply --ignore-whitespace --recount`. The post-image blob ids
on the index lines predate the review changes.

diff --git a/main.c b/main.c
index ece6c8c..116f544 100644
--- a/main.c
+++ b/main.c
@@ -6,7 +6,7 @@
 #define BUFFER_LENGTH 128
 #define LINE_LENGTH 1024
 #define HASH_BUCKETS 4000037
-#define MIN_COUNT 10000
+#define MIN_COUNT 0
 
 void string_ncopy(char *dest, const char *src, size_t max_len) {
     size_t i = 0;
@@ -30,35 +30,37 @@ void newPerson(person *p, const char *name) {
 }
 
+/*
+ * Insert `name` into the list at *head in ascending strcmp() order, or bump
+ * the count of the node that already carries it. The single pass with an
+ * early stop assumes every node was added through this function, so the list
+ * is already sorted.
+ */
 void sorted_name_insert(person **head, char *name) {
-    person *node = (person *) malloc(sizeof(person));
-    newPerson(node, name);
-    if (*head == NULL) {
-        *head = node;
-    } else {
-        person *p = *head;
-        person *p_prev = NULL;
-        int cmp = strcmp(p->name, name);
-        while (p->next != NULL && cmp < 0) {
-            p_prev = p;
-            p = p->next;
-            cmp = strcmp(p->name, name);
-        }
-        if (cmp == 0){
-            p->count++;
-            free(node);
-        }else if (p_prev == NULL) {
-            node->next = *head;
-            *head = node;
-        } else if (p->next != NULL && cmp < 0) {
-            node->next = p;
-            p_prev->next = node;
-        } else {
-            p->next = node;
-            node->next = NULL;
-        }
-    }
+    /* `link` is the pointer slot (*head or some ->next) the new node would fill. */
+    person **link = head;
+    while (*link != NULL) {
+        int cmp = strcmp((*link)->name, name);
+        if (cmp == 0) {
+            (*link)->count++;
+            return;
+        }
+        if (cmp > 0) {
+            break;
+        }
+        link = &(*link)->next;
+    }
+
+    /* Allocate only once the name is known to be new, so a hit frees nothing. */
+    person *node = malloc(sizeof *node);
+    if (node == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+    newPerson(node, name);
+    node->next = *link;
+    *link = node;
 }
 
 void sorted_count_insert(person **head, person *node) {
     if (*head == NULL) {
         *head = node;
@@ -126,25 +128,37 @@ void parse_line(char *line, char *buffer) {
         line_it++;
     }
     line_it++;
-    char *surname_end = line_it, *surname_start = line_it;
+    char *content_start = line_it, *last_space = NULL, *second_last_space = NULL;
     while (*line_it && *line_it != '<') {
         if (*line_it == ' ') {
-            surname_start = surname_end;
-            surname_end = line_it;
+            second_last_space = last_space;
+            last_space = line_it;
         }
         line_it++;
     }
-    bool only_numbers = true;
-    char *c = surname_end;
-    while (only_numbers && c != line_it) {
-        if (!isdigit(*c)) {
-            only_numbers = false;
-        }
-        c++;
-    }
+    /*
+     * A trailing token made up solely of digits (e.g. "Wang 0001") is a numeric
+     * suffix rather than a surname; only then is the surname the token before
+     * it. Testing every character keeps names that merely start with a digit,
+     * and the unsigned char cast keeps isdigit() defined for bytes >= 0x80.
+     */
+    bool numeric_suffix = false;
+    if (last_space != NULL && last_space + 1 != line_it) {
+        numeric_suffix = true;
+        for (const char *c = last_space + 1; c != line_it; c++) {
+            if (!isdigit((unsigned char) *c)) {
+                numeric_suffix = false;
+                break;
+            }
+        }
+    }
 
-    if (!only_numbers) {
-        surname_start = surname_end;
+    char *surname_start, *surname_end;
+    if (numeric_suffix) {
+        surname_start = second_last_space ? second_last_space + 1 : content_start;
+        surname_end = last_space;
+    } else {
+        surname_start = last_space ? last_space + 1 : content_start;
         surname_end = line_it;
     }
     size_t name_length = surname_end - surname_start;
@@ -175,6 +189,33 @@ void make_list(person **hashmap, person **list, const int min_count) {
     free(hashmap);
 }
 
+/*
+ * Debug aid: report test->name on stderr when more than one node of `list`
+ * carries that name. stderr is used so the report stays out of whatever is
+ * captured from stdout.
+ */
+void verify_list(person *list, person *test) {
+    int occurrences = 0;
+    for (person *p = list; p != NULL; p = p->next) {
+        if (strcmp(p->name, test->name) == 0) {
+            occurrences++;
+        }
+    }
+    if (occurrences > 1) {
+        fprintf(stderr, "ERROR: %s\n", test->name);
+    }
+}
+
+/*
+ * Debug aid: run verify_list() for every node of `list`. Every pair of nodes
+ * is compared, so the cost is quadratic in the list length.
+ */
+void check_list(person *list) {
+    for (person *p = list; p != NULL; p = p->next) {
+        verify_list(list, p);
+    }
+}
+
 void clean_list(person *list) {
     person *p = list;
     while (p != NULL) {
@@ -202,8 +243,6 @@ int main(void) {
         hashmap[i] = NULL;
     }
     FILE *fp = fopen("dblp.xml", "r");
-    // char *line = malloc(sizeof(char) * LINE_LENGTH);
-    // size_t line_len = LINE_LENGTH;
     char *line = NULL;
     size_t line_len = 0;
     char *buffer = (char *) malloc(sizeof(char) * BUFFER_LENGTH);
@@ -227,10 +266,10 @@ int main(void) {
     }
     free(line);
     free(buffer);
-    printf("Done parsing!\n");
     person *list = NULL;
     make_list(hashmap, &list, MIN_COUNT);
+    check_list(list);
     display(list);
     clean_list(list);
     return 0;
-}
\ No newline at end of file
+}
diff --git a/validate_extraction.sh b/validate_extraction.sh
new file mode 100755
index 0000000..c40da3b
--- /dev/null
+++ b/validate_extraction.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#
+# Cross-check the surname counts printed by ./main against an independent
+# grep/sed/awk pipeline run over the same dblp.xml.
+
+set -euo pipefail
+
+# Must mirror MIN_COUNT in main.c so both sides apply the same cut-off.
+MIN_COUNT="${MIN_COUNT:-0}"
+
+if [[ ! -r dblp.xml ]]; then
+    echo "dblp.xml not found or not readable" >&2
+    exit 1
+fi
+if [[ ! -x ./main ]]; then
+    echo "./main not found; build it first" >&2
+    exit 1
+fi
+
+echo "Running full validation with working pipeline..."
+
+# Generate complete bash results using the proven pipeline
+echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
+{
+    grep -E "<(author|editor)" dblp.xml | \
+    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
+    awk '{
+        if ($NF ~ /^[0-9]{4}$/) { NF-- }
+        if (NF > 0) {
+            surname = $NF
+            gsub(/&[^;]*;/, "", surname)
+            if (length(surname) > 0) print surname
+        }
+    }' | \
+    sort | uniq -c | \
+    awk -v min="$MIN_COUNT" '$1 >= min {print $2, $1}' | \
+    sort -k2 -nr
+} > bash_results.txt
+
+echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"
+
+# Get your C program results
+./main > c_results.txt
+
+echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"
+
+# Quick comparison of top entries
+echo "Top 5 comparison:"
+echo "=== C Program ==="
+head -5 c_results.txt
+echo "=== Bash Script ==="
+head -5 bash_results.txt
+
+# Check Wang specifically
+echo "Wang comparison:"
+echo "C: $(grep "^Wang " c_results.txt)"
+echo "Bash: $(grep "^Wang " bash_results.txt)"