156 lines
5.2 KiB
Python
156 lines
5.2 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Python 2 only: the sort keys mix strings and ints, which Python 3 cannot compare
|
||
|
||
|
||
from itertools import groupby
|
||
from unicodedata import decomposition, name
|
||
from pprint import pprint as pp
|
||
|
||
# Leading words (already lower-cased) that are skipped when sorting titles
commonleaders = ['the']

# Each single character here sorts as its replacement string
replacements = {
    u'ß': 'ss',
    u'ſ': 's',
    u'ʒ': 's',
}

# Digit alphabets kept as sets for membership tests.  decdigits lists the
# ASCII digits explicitly instead of relying on str.isnumeric, which also
# accepts characters (e.g. vulgar fractions) that int() cannot parse.
decdigits = set('0123456789')
hexdigits = decdigits | set('abcdef')
|
||
|
||
try:
    _code_to_char = unichr  # Python 2: unichr() builds a unicode character
except NameError:
    _code_to_char = chr  # Python 3: chr() already returns text


def splitchar(c):
    '''De-ligature and de-accent a single character.

    c is one (unicode) character.  The return value is a string of one or
    more characters:

    * a character without a Unicode decomposition is returned unchanged;
    * a character whose name contains LIGATURE is expanded to all of its
      component characters (however many there are);
    * any other decomposable character is reduced to the first character
      of its decomposition, i.e. the base letter without combining marks.

    The reduction is applied repeatedly, so a letter carrying several
    accents (whose decomposition is itself an accented letter) ends up as
    the plain base letter.
    '''
    mapping = decomposition(c)
    if not mapping:
        return c

    # A decomposition mapping is an optional '<tag>' followed by hexadecimal
    # code points; only the code points are of interest.
    codes = [field for field in mapping.split()
             if not field.startswith('<')]

    charname = name(c, c).upper()
    if len(codes) > 1 and 'PRECEDE' in charname:
        # E.g. LATIN SMALL LETTER N PRECEDED BY APOSTROPHE lists the
        # apostrophe first; swap so that the letter becomes the base.
        codes[0], codes[1] = codes[1], codes[0]

    parts = [_code_to_char(int(code, 16)) for code in codes]
    if 'LIGATURE' in charname:
        # Keep every component, each one reduced in turn.
        return ''.join(splitchar(part) for part in parts)

    # Drop the trailing (combining) characters and keep reducing the base.
    return splitchar(parts[0])
|
||
|
||
|
||
def sortkeygen(s):
    r'''Build the 'natural' sort key for s.

    s is converted with unicode() and then normalised in this order:
    surrounding whitespace is dropped and inner whitespace runs become one
    space; the text is lower-cased; a leading word found in commonleaders
    is removed (unless it is the only word); every character is passed
    through splitchar(); characters found in replacements are substituted;
    finally each run of decimal digits is turned into an int.

    Returns a list made of unicode text runs and ints.

    Doctests:
        >>> sortkeygen('  some extra    spaces  ')
        [u'some extra spaces']
        >>> sortkeygen('CasE InseNsItIve')
        [u'case insensitive']
        >>> sortkeygen('The Wind in the Willows')
        [u'wind in the willows']
        >>> sortkeygen(u'\u0132 ligature')
        [u'ij ligature']
        >>> sortkeygen(u'\xdd\xfd upper/lower case Y with acute accent')
        [u'yy upper/lower case y with acute accent']
        >>> sortkeygen('foo9.txt')
        [u'foo', 9, u'.txt']
        >>> sortkeygen('x9y99')
        [u'x', 9, u'y', 99]
    '''
    # Whitespace: trim both ends, and treat every kind/run of space alike
    text = ' '.join(unicode(s).strip().split())

    # Case-insensitive comparison
    text = text.lower()

    # Titles: skip a common leading word, but never the only word
    tokens = text.split()
    if tokens[1:] and tokens[0] in commonleaders:
        text = ' '.join(tokens[1:])

    # Accents and ligatures, one character at a time
    text = ''.join(map(splitchar, text))

    # Replacements: one character swapped for one or more characters
    swapped = []
    for ch in text:
        swapped.append(replacements.get(ch, ch))
    text = ''.join(swapped)

    # Digit runs become ints; everything in between stays text
    key = []
    for is_number, run in groupby(text, lambda ch: ch in decdigits):
        chunk = ''.join(run)
        key.append(int(chunk) if is_number else chunk)
    return key
|
||
|
||
def naturalsort(items):
    '''Return a new list holding items in 'natural' order.

    Every item is ranked by the key that sortkeygen() builds for it; the
    items themselves are returned unmodified.

    Doctests:
        >>> naturalsort(['The Wind in the Willows','The 40th step more',
        ...              'The 39 steps', 'Wanda'])
        ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']
    '''
    ordered = list(items)
    ordered.sort(key=sortkeygen)
    return ordered
|
||
|
||
if __name__ == '__main__':
    import string

    ns = naturalsort

    # NOTE: print is always called with ONE parenthesised argument, which the
    # Python 2 print statement treats exactly like the bare expression.

    def _print_lines(lines):
        # Show the strings themselves, one per line, rather than their repr
        print('\n'.join(lines))

    def _demo(heading, txt, show=pp):
        # One demonstration: the input, its plain sort and its natural sort
        print(heading)
        print('Text strings:')
        pp(txt)
        print('Normally sorted :')
        show(sorted(txt))
        print('Naturally sorted:')
        show(ns(txt))

    _demo('\n# Ignoring leading spaces',
          ['%signore leading spaces: 2%+i' % (' '*i, i-2)
           for i in range(4)])

    _demo('\n# Ignoring multiple adjacent spaces (m.a.s)',
          ['ignore m.a.s%s spaces: 2%+i' % (' '*i, i-2)
           for i in range(4)])

    _demo('\n# Equivalent whitespace characters',
          ['Equiv.%sspaces: 3%+i' % (ch, i-3)
           for i, ch in enumerate(reversed(string.whitespace))])

    phrase = 'CASE INDEPENENT'
    _demo('\n# Case Indepenent sort',
          [phrase[:i].lower() + phrase[i:] + ': 3%+i' % (i-3)
           for i in range(1, 5)])

    _demo('\n# Numeric fields as numerics',
          ['foo100bar99baz0.txt', 'foo100bar10baz0.txt',
           'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'])

    _demo('\n# Title sorts',
          ['The Wind in the Willows', 'The 40th step more',
           'The 39 steps', 'Wanda'])

    _demo('\n# Equivalent accented characters (and case)',
          ['Equiv. %s accents: 2%+i' % (ch, i-2)
           for i, ch in enumerate(u'\xfd\xddyY')])

    _demo('\n# Separated ligatures',
          [u'\462 ligatured ij', 'no ligature'])

    specials = u'ʒſßs'  # u'\u0292\u017f\xdfs'
    _demo('\n# Character replacements',
          ['Start with an %s: 2%+i' % (ch, i-2)
           for i, ch in enumerate(specials)],
          show=_print_lines)
|