RosettaCodeData/Task/Natural-sorting/C/natural-sorting-1.c

258 lines
6.3 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include <string.h>
#include <locale.h>
typedef struct wstr {
wchar_t *s;
int n, alloc;
} wstr;
#define w_del(w) { free(w->s); free(w); }
#define forchars(i, c, w) for(i = 0, c = w->s[0]; i < w->n && c; c = w->s[++i])
wstr *w_new()
{
wstr *w = malloc(sizeof(wstr));
w->alloc = 1;
w->n = 0;
w->s = malloc(sizeof(wchar_t));
w->s[0] = 0;
return w;
}
void w_append(wstr *w, wchar_t c)
{
int n = w->n + 1;
if (n >= w->alloc) {
w->alloc *= 2;
w->s = realloc(w->s, w->alloc * sizeof(wchar_t));
}
w->s[w->n++] = c;
w->s[w->n] = 0;
}
wstr *w_make(wchar_t *s)
{
int i, len = wcslen(s);
wstr *w = w_new();
for (i = 0; i < len; i++) w_append(w, s[i]);
return w;
}
typedef void (*wtrans_func)(wstr *, wstr *);
void w_transform(wstr *in, wtrans_func f)
{
wstr t, *out = w_new();
f(in, out);
t = *in; *in = *out; *out = t;
w_del(out);
}
#define transfunc(x) void w_##x(wstr *in, wstr *out)
transfunc(nocase) {
int i;
wchar_t c;
forchars(i, c, in) w_append(out, towlower(c));
}
transfunc(despace) {
int i, gotspace = 0;
wchar_t c;
forchars(i, c, in) {
if (!iswspace(c)) {
if (gotspace && out->n)
w_append(out, L' ');
w_append(out, c);
gotspace = 0;
} else gotspace = 1;
}
}
static const wchar_t *const tbl_accent[] = { /* copied from Perl6 code */
L"Þ", L"TH", L"þ", L"th", L"Ð", L"TH", L"ð", L"th", L"À", L"A",
L"Á", L"A", L"Â", L"A", L"Ã", L"A", L"Ä", L"A", L"Å", L"A", L"à",
L"a", L"á", L"a", L"â", L"a", L"ã", L"a", L"ä", L"a", L"å", L"a",
L"Ç", L"C", L"ç", L"c", L"È", L"E", L"É", L"E", L"Ê", L"E", L"Ë",
L"E", L"è", L"e", L"é", L"e", L"ê", L"e", L"ë", L"e", L"Ì",
L"I", L"Í", L"I", L"Î", L"I", L"Ï", L"I", L"ì", L"i", L"í",
L"i", L"î", L"i", L"ï", L"i", L"Ò", L"O", L"Ó", L"O", L"Ô",
L"O", L"Õ", L"O", L"Ö", L"O", L"Ø", L"O", L"ò", L"o", L"ó", L"o",
L"ô", L"o", L"õ", L"o", L"ö", L"o", L"ø", L"o", L"Ñ", L"N", L"ñ", L"n",
L"Ù", L"U", L"Ú", L"U", L"Û", L"U", L"Ü", L"U", L"ù", L"u", L"ú", L"u",
L"û", L"u", L"ü", L"u", L"Ý", L"Y", L"ÿ", L"y", L"ý", L"y" };
static const wchar_t *const tbl_ligature[] = {
L"Æ", L"AE", L"æ", L"ae", L"ß", L"ss",
L"", L"ffl", L"", L"ffi", L"", L"fi", L"", L"ff", L"", L"fl",
L"ſ", L"s", L"ʒ", L"z", L"", L"st", /* ... come on ... */
};
void w_char_repl(wstr *in, wstr *out, const wchar_t *const *tbl, int len)
{
int i, j, k;
wchar_t c;
forchars(i, c, in) {
for (j = k = 0; j < len; j += 2) {
if (c != tbl[j][0]) continue;
for (k = 0; tbl[j + 1][k]; k++)
w_append(out, tbl[j + 1][k]);
break;
}
if (!k) w_append(out, c);
}
}
transfunc(noaccent) {
w_char_repl(in, out, tbl_accent, sizeof(tbl_accent)/sizeof(wchar_t*));
}
transfunc(noligature) {
w_char_repl(in, out, tbl_ligature, sizeof(tbl_ligature)/sizeof(wchar_t*));
}
static const wchar_t *const tbl_article[] = {
L"the", L"a", L"of", L"to", L"is", L"it" };
#define N_ARTICLES sizeof(tbl_article)/sizeof(tbl_article[0])
transfunc(noarticle) {
int i, j, n;
wchar_t c, c0 = 0;
forchars(i, c, in) {
if (!c0 || (iswalnum(c) && !iswalnum(c0))) { /* word boundary */
for (j = N_ARTICLES - 1; j >= 0; j--) {
n = wcslen(tbl_article[j]);
if (wcsncasecmp(in->s + i, tbl_article[j], n))
continue;
if (iswalnum(in->s[i + n])) continue;
i += n;
break;
}
if (j < 0) w_append(out, c);
} else
w_append(out, c);
c0 = c;
}
}
enum { wi_space = 0, wi_case, wi_accent, wi_lig, wi_article, wi_numeric };
#define WS_NOSPACE (1 << wi_space)
#define WS_NOCASE (1 << wi_case)
#define WS_ACCENT (1 << wi_accent)
#define WS_LIGATURE (1 << wi_lig)
#define WS_NOARTICLE (1 << wi_article)
#define WS_NUMERIC (1 << wi_numeric)
const wtrans_func trans_funcs[] = {
w_despace, w_nocase, w_noaccent, w_noligature, w_noarticle, 0
};
const char *const flagnames[] = {
"collapse spaces",
"case insensitive",
"disregard accent",
"decompose ligatures",
"discard common words",
"numeric",
};
typedef struct { wchar_t* s; wstr *w; } kw_t;
int w_numcmp(const void *a, const void *b)
{
wchar_t *pa = ((const kw_t*)a)->w->s, *pb = ((const kw_t*)b)->w->s;
int sa, sb, ea, eb;
while (*pa && *pb) {
if (iswdigit(*pa) && iswdigit(*pb)) {
/* skip leading zeros */
sa = sb = 0;
while (pa[sa] == L'0') sa++;
while (pb[sb] == L'0') sb++;
/* find end of numbers */
ea = sa; eb = sb;
while (iswdigit(pa[ea])) ea++;
while (iswdigit(pb[eb])) eb++;
if (eb - sb > ea - sa) return -1;
if (eb - sb < ea - sa) return 1;
while (sb < eb) {
if (pa[sa] > pb[sb]) return 1;
if (pa[sa] < pb[sb]) return -1;
sa++; sb++;
}
pa += ea; pb += eb;
}
else if (iswdigit(*pa)) return 1;
else if (iswdigit(*pb)) return -1;
else {
if (*pa > *pb) return 1;
if (*pa < *pb) return -1;
pa++; pb++;
}
}
return (!*pa && !*pb) ? 0 : *pa ? 1 : -1;
}
int w_cmp(const void *a, const void *b)
{
return wcscmp(((const kw_t*)a)->w->s, ((const kw_t*)b)->w->s);
}
void natural_sort(wchar_t **strings, int len, int flags)
{
int i, j;
kw_t *kws = malloc(sizeof(kw_t) * len);
for (i = 0; i < len; i++) {
kws[i].s = strings[i];
kws[i].w = w_make(strings[i]);
for (j = 0; j < wi_numeric; j++)
if (flags & (1 << j) && trans_funcs[j])
w_transform(kws[i].w, trans_funcs[j]);
}
qsort(kws, len, sizeof(kw_t), (flags & WS_NUMERIC) ? w_numcmp : w_cmp);
for (i = 0; i < len; i++) {
w_del(kws[i].w);
strings[i] = kws[i].s;
}
free(kws);
}
const wchar_t *const test[] = {
L" 0000098 nina", L"100 niño", L"99 Ninja", L"100 NINA",
L" The work is so difficult to do it took ſome 100 aeons. ",
L"The work is so difficult it took some 100 aeons.",
L" The work is so difficult it took ſome 99 æons. ",
};
#define N_STRINGS sizeof(test)/sizeof(*test)
void test_sort(int flags)
{
int i, j;
const wchar_t *str[N_STRINGS];
memcpy(str, test, sizeof(test));
printf("Sort flags: (");
for (i = 0, j = flags; j; i++, j >>= 1)
if ((j & 1))
printf("%s%s", flagnames[i], j > 1 ? ", ":")\n");
natural_sort((wchar_t **)str, N_STRINGS, flags);
for (i = 0; i < N_STRINGS; i++)
printf("%ls\n", str[i]);
printf("\n");
}
int main()
{
setlocale(LC_CTYPE, "");
test_sort(WS_NOSPACE);
test_sort(WS_NOCASE);
test_sort(WS_NUMERIC);
test_sort(WS_NOARTICLE|WS_NOSPACE);
test_sort(WS_NOCASE|WS_NOSPACE|WS_ACCENT);
test_sort(WS_LIGATURE|WS_NOCASE|WS_NOSPACE|WS_NUMERIC|WS_ACCENT|WS_NOARTICLE);
return 0;
}