Files
c_prog/surnames.c
2025-06-13 05:51:53 +02:00

234 lines
5.9 KiB
C

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFFER_LENGTH 128
#define LINE_LENGTH 1024
#define HASH_BUCKETS 4000037
#define MIN_COUNT 10000
/*
* Usage: pipe dblp.xml to the program or have it in the same folder as the program
*/
void string_ncopy(char *dest, const char *src, size_t max_len) {
size_t i = 0;
while (i < max_len - 1 && src[i]) {
dest[i] = src[i];
i++;
}
dest[i] = '\0';
}
typedef struct person {
struct person *next;
char name[BUFFER_LENGTH];
int count;
} person;
person* newPerson(const char *name) {
person *p = (person *) malloc(sizeof(person));
string_ncopy(p->name, name, BUFFER_LENGTH);
p->count = 1;
p->next = NULL;
return p;
}
void sorted_name_insert(person **head, char *name) {
person *p = *head;
while (p != NULL) {
if (strcmp(p->name, name) == 0) {
p->count++;
return;
}
p = p->next;
}
person *node = newPerson(name);
if (*head == NULL || strcmp((*head)->name, name) > 0) {
node->next = *head;
*head = node;
return;
}
p = *head;
while (p->next != NULL && strcmp(p->next->name, name) < 0) {
p = p->next;
}
node->next = p->next;
p->next = node;
}
void sorted_count_insert(person **head, person *node) {
if (*head == NULL) {
*head = node;
} else {
person *p = *head;
person *p_prev = NULL;
int cmp = p->count - node->count;
while (p->next != NULL && cmp > 0) {
p_prev = p;
p = p->next;
cmp = p->count - node->count;
}
if (p_prev == NULL) {
node->next = *head;
*head = node;
} else if (p->next != NULL && cmp < 0) {
node->next = p;
p_prev->next = node;
} else {
p->next = node;
node->next = NULL;
}
}
}
void display(person *head) {
person *p = head;
while (p != NULL) {
printf("%s %d\n", p->name, p->count);
p = p->next;
}
}
//djb2 hash http://www.cse.yorku.ca/~oz/hash.html
u_long hash(const unsigned char *str) {
u_long hash = 5381;
int c;
while ((c = *str++)) {
hash = ((hash << 5) + hash) + c;
}
return hash;
}
void hm_insert(person **hashmap, char *name) {
u_long hash_value = hash((unsigned char *) name);
hash_value = hash_value % HASH_BUCKETS;
sorted_name_insert(&hashmap[hash_value], name);
}
void parse_line(char *line, char *buffer) {
char *line_it = line;
if (*line_it == '<') {
line_it++;
size_t i = 0;
while (i < BUFFER_LENGTH - 1 && *line_it != ' ' && *line_it != '>' &&
*line_it != '\0' && *line_it != '\n') {
buffer[i] = *line_it;
line_it++;
i++;
}
buffer[i] = '\0';
if (strcmp(buffer, "author") == 0 || strcmp(buffer, "editor") == 0) {
memset(buffer, 0, BUFFER_LENGTH);
while (*line_it != '>') {
line_it++;
}
line_it++;
char *content_start = line_it, *last_space = NULL, *second_last_space = NULL;
while (*line_it && *line_it != '<') {
if (*line_it == ' ') {
second_last_space = last_space;
last_space = line_it;
}
line_it++;
}
char *surname_start, *surname_end;
if (last_space && isdigit(*(last_space+1))) {
surname_start = second_last_space ? second_last_space + 1 : content_start;
surname_end = last_space ? last_space : line_it;
} else {
surname_start = last_space ? last_space + 1 : content_start;
surname_end = line_it;
}
size_t name_length = surname_end - surname_start;
memcpy(buffer, surname_start, name_length);
buffer[name_length] = '\0';
} else {
memset(buffer, 0, BUFFER_LENGTH);
}
}
}
void make_list(person **hashmap, person **list, const int min_count) {
size_t i = 0;
for (i = 0; i < HASH_BUCKETS; i++) {
person *p = hashmap[i];
while (p != NULL) {
person *p_next = p->next;
if (p->count >= min_count) {
p->next = NULL;
sorted_count_insert(list, p);
} else {
free(p);
}
p = p_next;
}
hashmap[i] = NULL;
}
free(hashmap);
}
void clean_list(person *list) {
person *p = list;
while (p != NULL) {
person *next = p->next;
free(p);
p = next;
}
}
void clean_memory(person **hashmap) {
for (int i = 0; i < HASH_BUCKETS; i++) {
person *p = hashmap[i];
while (p != NULL) {
person *next = p->next;
free(p);
p = next;
}
}
free(hashmap);
}
int main(void) {
person **hashmap = (person **) malloc(sizeof(person *) * HASH_BUCKETS);
for (int i = 0; i < HASH_BUCKETS; i++) {
hashmap[i] = NULL;
}
FILE *fp = fopen("dblp.xml", "r");
char *line = NULL;
size_t line_len = 0;
char *buffer = (char *) malloc(sizeof(char) * BUFFER_LENGTH);
if (fp) {
while (getline(&line, &line_len, fp) >= 0) {
memset(buffer, 0, BUFFER_LENGTH);
parse_line(line, buffer);
if (*buffer != '\0') {
hm_insert(hashmap, buffer);
}
}
fclose(fp);
} else {
while (getline(&line, &line_len, stdin) >= 0) {
memset(buffer, 0, BUFFER_LENGTH);
parse_line(line, buffer);
if (*buffer != '\0') {
hm_insert(hashmap, buffer);
}
}
}
free(line);
free(buffer);
person *list = NULL;
make_list(hashmap, &list, MIN_COUNT);
display(list);
clean_list(list);
return 0;
}