/*
 * Counts how often each author/editor surname occurs in "dblp.xml" (or on
 * stdin when that file cannot be opened) and prints every surname that was
 * seen at least MIN_COUNT times, most frequent first.
 */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L /* getline() */
#endif

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#define BUFFER_LENGTH 128
#define LINE_LENGTH 1024
#define HASH_BUCKETS 4000037
#define MIN_COUNT 10000

/*
 * Copies at most max_len - 1 characters of src into dest and always
 * NUL-terminates dest. Does nothing when max_len is 0, because there is no
 * room even for the terminator.
 */
void string_ncopy(char *dest, const char *src, size_t max_len) {
    if (max_len == 0) {
        return;
    }
    size_t i = 0;
    while (i + 1 < max_len && src[i] != '\0') {
        dest[i] = src[i];
        i++;
    }
    dest[i] = '\0';
}

/* One surname and its occurrence count; doubles as a singly linked list node. */
typedef struct person {
    struct person *next;
    char name[BUFFER_LENGTH];
    int count;
} person;

/* Initialises *p as an unlinked entry for name with a count of 1. */
void newPerson(person *p, const char *name) {
    string_ncopy(p->name, name, BUFFER_LENGTH);
    p->count = 1;
    p->next = NULL;
}

/*
 * Records one occurrence of name in the list *head, which is kept sorted in
 * ascending strcmp() order.
 *
 * If name is already present its count is incremented; otherwise a new node
 * is allocated and linked in at its sorted position. The node is only
 * allocated once we know it is needed. Terminates the program if the
 * allocation fails.
 */
void sorted_name_insert(person **head, char *name) {
    /* link always points at the pointer that would have to reference a
     * newly inserted node, so head/middle/tail insertion are one case. */
    person **link = head;
    while (*link != NULL) {
        int cmp = strcmp((*link)->name, name);
        if (cmp == 0) {
            (*link)->count++;
            return;
        }
        if (cmp > 0) {
            break; /* first entry that sorts after name: insert before it */
        }
        link = &(*link)->next;
    }

    person *node = malloc(sizeof *node);
    if (node == NULL) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }
    newPerson(node, name);
    node->next = *link;
    *link = node;
}

/*
 * Links node into the list *head, which is kept sorted by descending count.
 * A node whose count ties with existing entries is placed after them, so
 * equal counts keep their insertion order. node->next is overwritten.
 */
void sorted_count_insert(person **head, person *node) {
    person **link = head;
    while (*link != NULL && (*link)->count >= node->count) {
        link = &(*link)->next;
    }
    node->next = *link;
    *link = node;
}

/* Prints every entry of the list as "<name> <count>", one per line. */
void display(person *head) {
    for (person *p = head; p != NULL; p = p->next) {
        printf("%s %d\n", p->name, p->count);
    }
}

/* djb2 string hash, see http://www.cse.yorku.ca/~oz/hash.html */
unsigned long hash(const unsigned char *str) {
    unsigned long h = 5381;
    while (*str != '\0') {
        h = ((h << 5) + h) + *str++; /* h * 33 + c */
    }
    return h;
}

/* Records one occurrence of name in the bucket of hashmap selected by hash(). */
void hm_insert(person **hashmap, char *name) {
    unsigned long bucket = hash((unsigned char *) name) % HASH_BUCKETS;
    sorted_name_insert(&hashmap[bucket], name);
}

/* Non-zero when the tag_len characters at tag spell exactly expected. */
static int tag_equals(const char *tag, size_t tag_len, const char *expected) {
    return strlen(expected) == tag_len && memcmp(tag, expected, tag_len) == 0;
}

/* Returns the end of [begin, end) after dropping trailing whitespace. */
static const char *trim_end(const char *begin, const char *end) {
    while (end > begin && isspace((unsigned char) end[-1])) {
        end--;
    }
    return end;
}

/* Returns the start of the last whitespace-delimited word of [begin, end). */
static const char *last_word(const char *begin, const char *end) {
    const char *word = end;
    while (word > begin && !isspace((unsigned char) word[-1])) {
        word--;
    }
    return word;
}

/* Non-zero when [begin, end) is non-empty and holds only decimal digits. */
static int all_digits(const char *begin, const char *end) {
    if (begin == end) {
        return 0;
    }
    for (const char *c = begin; c != end; c++) {
        if (!isdigit((unsigned char) *c)) {
            return 0;
        }
    }
    return 1;
}

/*
 * Extracts the surname from a line of the form "<author ...>Name</author>"
 * or "<editor ...>Name</editor>".
 *
 * line   - NUL-terminated input line.
 * buffer - output, at least BUFFER_LENGTH bytes. Receives the surname, or
 *          the empty string when the line is not an author/editor element,
 *          the start tag is not closed on this line, or no surname is left.
 *
 * The surname is the last whitespace-delimited word of the element text. A
 * trailing word made only of digits (e.g. "0001", presumably a homonym
 * disambiguation suffix) is skipped so the word before it is used instead.
 * Surnames longer than BUFFER_LENGTH - 1 characters are truncated.
 */
void parse_line(char *line, char *buffer) {
    buffer[0] = '\0';
    if (line[0] != '<') {
        return;
    }

    /* The tag name runs up to the first space, '>' or end of line. */
    const char *tag = line + 1;
    size_t tag_len = strcspn(tag, " >\n");
    if (!tag_equals(tag, tag_len, "author") &&
        !tag_equals(tag, tag_len, "editor")) {
        return;
    }

    /* Skip any attributes; give up if the start tag never closes. */
    const char *text = strchr(tag + tag_len, '>');
    if (text == NULL) {
        return;
    }
    text++;

    /* Element text ends at the closing tag (or at the end of the line). */
    const char *end = trim_end(text, text + strcspn(text, "<"));
    const char *word = last_word(text, end);
    if (all_digits(word, end)) {
        end = trim_end(text, word);
        word = last_word(text, end);
    }

    size_t name_length = (size_t) (end - word);
    if (name_length >= BUFFER_LENGTH) {
        name_length = BUFFER_LENGTH - 1;
    }
    memcpy(buffer, word, name_length);
    buffer[name_length] = '\0';
}

/*
 * Moves every entry with count >= min_count from the hash table into *list
 * (sorted by descending count), frees all other entries, and finally frees
 * the bucket array itself. hashmap must not be used after this call.
 */
void make_list(person **hashmap, person **list, const int min_count) {
    for (size_t i = 0; i < HASH_BUCKETS; i++) {
        person *p = hashmap[i];
        while (p != NULL) {
            person *p_next = p->next; /* saved: p is relinked or freed below */
            if (p->count >= min_count) {
                p->next = NULL;
                sorted_count_insert(list, p);
            } else {
                free(p);
            }
            p = p_next;
        }
        hashmap[i] = NULL;
    }
    free(hashmap);
}

/* Frees every node of the list. */
void clean_list(person *list) {
    while (list != NULL) {
        person *next = list->next;
        free(list);
        list = next;
    }
}

/* Frees every entry in every bucket of hashmap, then the bucket array. */
void clean_memory(person **hashmap) {
    for (int i = 0; i < HASH_BUCKETS; i++) {
        clean_list(hashmap[i]);
    }
    free(hashmap);
}

int main(void) {
    person **hashmap = malloc(sizeof *hashmap * HASH_BUCKETS);
    if (hashmap == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < HASH_BUCKETS; i++) {
        hashmap[i] = NULL;
    }

    /* Fall back to stdin when the default input file cannot be opened. */
    FILE *fp = fopen("dblp.xml", "r");
    FILE *in = (fp != NULL) ? fp : stdin;

    char *line = NULL; /* owned by getline(), released below */
    size_t line_len = 0;
    char buffer[BUFFER_LENGTH];
    while (getline(&line, &line_len, in) >= 0) {
        parse_line(line, buffer);
        if (buffer[0] != '\0') {
            hm_insert(hashmap, buffer);
        }
    }
    free(line);
    if (fp != NULL) {
        fclose(fp);
    }

    printf("Done parsing!\n");
    person *list = NULL;
    make_list(hashmap, &list, MIN_COUNT); /* also frees hashmap */
    display(list);
    clean_list(list);
    return 0;
}