added a validation script and fixed some bugs with sorted insert
This commit is contained in:
97
main.c
97
main.c
@@ -6,7 +6,7 @@
|
|||||||
#define BUFFER_LENGTH 128
|
#define BUFFER_LENGTH 128
|
||||||
#define LINE_LENGTH 1024
|
#define LINE_LENGTH 1024
|
||||||
#define HASH_BUCKETS 4000037
|
#define HASH_BUCKETS 4000037
|
||||||
#define MIN_COUNT 10000
|
#define MIN_COUNT 0
|
||||||
|
|
||||||
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
void string_ncopy(char *dest, const char *src, size_t max_len) {
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
@@ -30,35 +30,34 @@ void newPerson(person *p, const char *name) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void sorted_name_insert(person **head, char *name) {
|
void sorted_name_insert(person **head, char *name) {
|
||||||
|
person *p = *head;
|
||||||
|
while (p != NULL) {
|
||||||
|
if (strcmp(p->name, name) == 0) {
|
||||||
|
p->count++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
p = p->next;
|
||||||
|
}
|
||||||
|
|
||||||
person *node = (person *) malloc(sizeof(person));
|
person *node = (person *) malloc(sizeof(person));
|
||||||
newPerson(node, name);
|
newPerson(node, name);
|
||||||
if (*head == NULL) {
|
|
||||||
|
if (*head == NULL || strcmp((*head)->name, name) > 0) {
|
||||||
|
node->next = *head;
|
||||||
*head = node;
|
*head = node;
|
||||||
} else {
|
return;
|
||||||
person *p = *head;
|
|
||||||
person *p_prev = NULL;
|
|
||||||
int cmp = strcmp(p->name, name);
|
|
||||||
while (p->next != NULL && cmp < 0) {
|
|
||||||
p_prev = p;
|
|
||||||
p = p->next;
|
|
||||||
cmp = strcmp(p->name, name);
|
|
||||||
}
|
|
||||||
if (cmp == 0){
|
|
||||||
p->count++;
|
|
||||||
free(node);
|
|
||||||
}else if (p_prev == NULL) {
|
|
||||||
node->next = *head;
|
|
||||||
*head = node;
|
|
||||||
} else if (p->next != NULL && cmp < 0) {
|
|
||||||
node->next = p;
|
|
||||||
p_prev->next = node;
|
|
||||||
} else {
|
|
||||||
p->next = node;
|
|
||||||
node->next = NULL;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
p = *head;
|
||||||
|
while (p->next != NULL && strcmp(p->next->name, name) < 0) {
|
||||||
|
p = p->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
node->next = p->next;
|
||||||
|
p->next = node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void sorted_count_insert(person **head, person *node) {
|
void sorted_count_insert(person **head, person *node) {
|
||||||
if (*head == NULL) {
|
if (*head == NULL) {
|
||||||
*head = node;
|
*head = node;
|
||||||
@@ -126,25 +125,21 @@ void parse_line(char *line, char *buffer) {
|
|||||||
line_it++;
|
line_it++;
|
||||||
}
|
}
|
||||||
line_it++;
|
line_it++;
|
||||||
char *surname_end = line_it, *surname_start = line_it;
|
char *content_start = line_it, *last_space = NULL, *second_last_space = NULL;
|
||||||
while (*line_it && *line_it != '<') {
|
while (*line_it && *line_it != '<') {
|
||||||
if (*line_it == ' ') {
|
if (*line_it == ' ') {
|
||||||
surname_start = surname_end;
|
second_last_space = last_space;
|
||||||
surname_end = line_it;
|
last_space = line_it;
|
||||||
}
|
}
|
||||||
line_it++;
|
line_it++;
|
||||||
}
|
}
|
||||||
bool only_numbers = true;
|
char *surname_start, *surname_end;
|
||||||
char *c = surname_end;
|
|
||||||
while (only_numbers && c != line_it) {
|
|
||||||
if (!isdigit(*c)) {
|
|
||||||
only_numbers = false;
|
|
||||||
}
|
|
||||||
c++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!only_numbers) {
|
if (last_space && isdigit(*(last_space+1))) {
|
||||||
surname_start = surname_end;
|
surname_start = second_last_space ? second_last_space + 1 : content_start;
|
||||||
|
surname_end = last_space ? last_space : line_it;
|
||||||
|
} else {
|
||||||
|
surname_start = last_space ? last_space + 1 : content_start;
|
||||||
surname_end = line_it;
|
surname_end = line_it;
|
||||||
}
|
}
|
||||||
size_t name_length = surname_end - surname_start;
|
size_t name_length = surname_end - surname_start;
|
||||||
@@ -175,6 +170,30 @@ void make_list(person **hashmap, person **list, const int min_count) {
|
|||||||
free(hashmap);
|
free(hashmap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Report a duplicated name in a list.
 *
 * Walks `list` and counts the nodes whose name compares equal (strcmp) to
 * `test->name`. `test` is normally a node of `list` itself, so one match is
 * expected; a second match means the name is stored twice and an
 * "ERROR: <name>" line is written.
 *
 * list: head of the list to scan (may be NULL, in which case nothing is
 *       reported).
 * test: node whose name is looked up; must not be NULL.
 *
 * The list is only read, never modified.
 */
void verify_list(person *list, person *test) {
    int count = 0;

    /* Two matches are enough to prove a duplicate, so stop scanning then. */
    for (person *p = list; p != NULL && count < 2; p = p->next) {
        if (strcmp(p->name, test->name) == 0) {
            count++;
        }
    }

    if (count > 1) {
        /* Diagnostics go to stderr so they cannot end up inside the result
         * listing when stdout is redirected to a file. */
        fprintf(stderr, "ERROR: %s\n", test->name);
    }
}
|
||||||
|
|
||||||
|
/*
 * Self-check of a list: call verify_list() once for every node, using the
 * node itself as the name to look up in the whole list.
 *
 * list: head of the list to check (NULL is allowed and does nothing).
 *
 * Each call to verify_list() walks the list again, so the cost grows
 * quadratically with the number of nodes.
 */
void check_list(person *list) {
    for (person *current = list; current != NULL; current = current->next) {
        verify_list(list, current);
    }
}
|
||||||
|
|
||||||
void clean_list(person *list) {
|
void clean_list(person *list) {
|
||||||
person *p = list;
|
person *p = list;
|
||||||
while (p != NULL) {
|
while (p != NULL) {
|
||||||
@@ -202,8 +221,6 @@ int main(void) {
|
|||||||
hashmap[i] = NULL;
|
hashmap[i] = NULL;
|
||||||
}
|
}
|
||||||
FILE *fp = fopen("dblp.xml", "r");
|
FILE *fp = fopen("dblp.xml", "r");
|
||||||
// char *line = malloc(sizeof(char) * LINE_LENGTH);
|
|
||||||
// size_t line_len = LINE_LENGTH;
|
|
||||||
char *line = NULL;
|
char *line = NULL;
|
||||||
size_t line_len = 0;
|
size_t line_len = 0;
|
||||||
char *buffer = (char *) malloc(sizeof(char) * BUFFER_LENGTH);
|
char *buffer = (char *) malloc(sizeof(char) * BUFFER_LENGTH);
|
||||||
@@ -227,9 +244,9 @@ int main(void) {
|
|||||||
}
|
}
|
||||||
free(line);
|
free(line);
|
||||||
free(buffer);
|
free(buffer);
|
||||||
printf("Done parsing!\n");
|
|
||||||
person *list = NULL;
|
person *list = NULL;
|
||||||
make_list(hashmap, &list, MIN_COUNT);
|
make_list(hashmap, &list, MIN_COUNT);
|
||||||
|
check_list(list);
|
||||||
display(list);
|
display(list);
|
||||||
clean_list(list);
|
clean_list(list);
|
||||||
return 0;
|
return 0;
|
||||||
|
41
validate_extraction.sh
Executable file
41
validate_extraction.sh
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
#
# validate_extraction.sh - cross-check the surname counts produced by ./main
# against an independent grep/sed/awk pipeline run over the same dblp.xml.
#
# Usage:   ./validate_extraction.sh
# Env:     MIN_COUNT  minimum number of occurrences a surname needs in order
#                     to be listed by the shell pipeline (default: 10000).
# Outputs: bash_results.txt and c_results.txt in the current directory.

INPUT="dblp.xml"
MIN_COUNT="${MIN_COUNT:-10000}"

# Fail early instead of comparing two empty or stale result files.
case "$MIN_COUNT" in
    ''|*[!0-9]*)
        echo "error: MIN_COUNT must be a non-negative integer" >&2
        exit 1
        ;;
esac
if [ ! -r "$INPUT" ]; then
    echo "error: cannot read $INPUT" >&2
    exit 1
fi
if [ ! -x ./main ]; then
    echo "error: ./main not found or not executable" >&2
    exit 1
fi

echo "Running full validation with working pipeline..."

# Generate the reference results with the shell pipeline:
#   1. keep the lines that contain an <author> or <editor> element,
#   2. strip everything but the element text,
#   3. take the last word as the surname, ignoring a trailing token made of
#      exactly four digits (presumably a numeric homonym suffix - confirm
#      against the data) and removing XML entities such as &uuml;,
#   4. count the occurrences and keep the surnames seen at least MIN_COUNT
#      times, most frequent first.
# The awk program keeps the field index in a variable instead of decrementing
# NF and spells out the four digit classes instead of using {4}; both forms
# behave the same on every awk implementation.
echo "Extracting all surnames (this may take a few minutes for 4.7GB file)..."
{
    grep -E "<(author|editor)" "$INPUT" | \
    sed -E 's/.*<(author|editor)[^>]*>//; s/<\/(author|editor)>.*//' | \
    awk '{
        last = NF
        if (last > 0 && $last ~ /^[0-9][0-9][0-9][0-9]$/) { last-- }
        if (last > 0) {
            surname = $last
            gsub(/&[^;]*;/, "", surname)
            if (length(surname) > 0) print surname
        }
    }' | \
    sort | uniq -c | \
    awk -v min="$MIN_COUNT" '$1 >= min {print $2, $1}' | \
    sort -k2 -nr
} > bash_results.txt

echo "Bash extraction complete. Results: $(wc -l < bash_results.txt) surnames"

# Get the C program results (it reads dblp.xml itself and prints to stdout)
./main > c_results.txt

echo "C extraction complete. Results: $(wc -l < c_results.txt) surnames"

# Quick comparison of top entries
echo "Top 5 comparison:"
echo "=== C Program ==="
head -5 c_results.txt
echo "=== Bash Script ==="
head -5 bash_results.txt

# Check Wang specifically
echo "Wang comparison:"
echo "C: $(grep "^Wang " c_results.txt)"
echo "Bash: $(grep "^Wang " bash_results.txt)"
|
||||||
|
|
Reference in New Issue
Block a user