RosettaCodeData/Task/Word-frequency/C/word-frequency.c

93 lines
2.8 KiB
C

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <glib.h>
/* A single word paired with the number of times it has been seen. */
struct word_count_tag {
    const char* word; /* text of the word; this struct never frees it */
    size_t count;     /* number of occurrences */
};
typedef struct word_count_tag word_count;
int compare_word_count(const void* p1, const void* p2) {
const word_count* w1 = p1;
const word_count* w2 = p2;
if (w1->count > w2->count)
return -1;
if (w1->count < w2->count)
return 1;
return 0;
}
bool get_top_words(const char* filename, size_t count) {
GError* error = NULL;
GMappedFile* mapped_file = g_mapped_file_new(filename, FALSE, &error);
if (mapped_file == NULL) {
fprintf(stderr, "%s\n", error->message);
g_error_free(error);
return false;
}
const char* text = g_mapped_file_get_contents(mapped_file);
if (text == NULL) {
fprintf(stderr, "File %s is empty\n", filename);
g_mapped_file_unref(mapped_file);
return false;
}
gsize file_size = g_mapped_file_get_length(mapped_file);
// Store word counts in a hash table
GHashTable* ht = g_hash_table_new_full(g_str_hash, g_str_equal,
g_free, g_free);
GRegex* regex = g_regex_new("\\w+", 0, 0, NULL);
GMatchInfo* match_info;
g_regex_match_full(regex, text, file_size, 0, 0, &match_info, NULL);
while (g_match_info_matches(match_info)) {
char* word = g_match_info_fetch(match_info, 0);
char* lower = g_utf8_strdown(word, -1);
g_free(word);
size_t* count = g_hash_table_lookup(ht, lower);
if (count != NULL) {
++*count;
g_free(lower);
} else {
count = g_new(size_t, 1);
*count = 1;
g_hash_table_insert(ht, lower, count);
}
g_match_info_next(match_info, NULL);
}
g_match_info_free(match_info);
g_regex_unref(regex);
g_mapped_file_unref(mapped_file);
// Sort words in decreasing order of frequency
size_t size = g_hash_table_size(ht);
word_count* words = g_new(word_count, size);
GHashTableIter iter;
gpointer key, value;
g_hash_table_iter_init(&iter, ht);
for (size_t i = 0; g_hash_table_iter_next(&iter, &key, &value); ++i) {
words[i].word = key;
words[i].count = *(size_t*)value;
}
qsort(words, size, sizeof(word_count), compare_word_count);
// Print the most common words
if (count > size)
count = size;
printf("Top %lu words\n", count);
printf("Rank\tCount\tWord\n");
for (size_t i = 0; i < count; ++i)
printf("%lu\t%lu\t%s\n", i + 1, words[i].count, words[i].word);
g_free(words);
g_hash_table_destroy(ht);
return true;
}
/*
 * Entry point. Expects exactly one command-line argument, the path of the
 * file to analyse, and prints its ten most frequent words.
 * Exits with EXIT_FAILURE on a usage error or when get_top_words() fails.
 */
int main(int argc, char** argv) {
    const size_t top_n = 10;

    if (argc == 2) {
        return get_top_words(argv[1], top_n) ? EXIT_SUCCESS : EXIT_FAILURE;
    }
    fprintf(stderr, "usage: %s file\n", argv[0]);
    return EXIT_FAILURE;
}