RosettaCodeData/Task/Natural-sorting/Python/natural-sorting.py

# -*- coding: utf-8 -*-
# Python 2 only: the generated sort keys mix strings and ints, which Python 3 cannot compare


from itertools import groupby
from unicodedata import decomposition, name
from pprint import pprint as pp

# Lowercase words that are skipped when they start a multi-word title.
commonleaders = ['the']

# Single characters that are spelled out as a replacement string when
# building a sort key (one char in, one or more chars out).
replacements = {
    u'ß': 'ss',
    u'ſ': 's',
    u'ʒ': 's',
}

# Digit classes are tested by plain set membership (deliberately not
# str.isnumeric, per the original note).
decdigits = set('0123456789')
hexdigits = decdigits | set('abcdef')

def splitchar(c):
    '''De-ligature and de-accent a single character.

    The Unicode decomposition mapping of c is looked up.  For an accented
    character the leading (base) character of the mapping is returned, so
    the combining marks are dropped.  For a character whose Unicode name
    contains LIGATURE every component is returned, however many there
    are.  A character without a decomposition mapping is returned
    unchanged.

    c must be a single unicode character.  The result is a unicode string
    of one or more characters.
    '''
    mapping = decomposition(c)
    if not mapping:
        return c

    # unichr only exists on Python 2; chr is its Python 3 counterpart.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # The mapping is an optional formatting tag such as <compat> followed
    # by hexadecimal code points; only the code points are of interest.
    parts = [to_char(int(field, 16))
             for field in mapping.split()
             if not field.startswith('<')]
    if not parts:
        # Nothing usable in the mapping: leave the character alone.
        return c

    charname = name(c, c).upper()
    if len(parts) > 1 and 'PRECEDE' in charname:
        # E.g. LATIN SMALL LETTER N PRECEDED BY APOSTROPHE lists the
        # apostrophe first; swap so that the letter becomes the base.
        parts[0], parts[1] = parts[1], parts[0]

    if 'LIGATURE' in charname:
        # Keep every component so that three-letter ligatures (ffi, ffl)
        # do not lose their last letter.
        return u''.join(parts)
    return parts[0]


def sortkeygen(s):
    r'''Build the 'natural' sort key for s.

    The text is converted to unicode, stripped, has its whitespace runs
    collapsed to single spaces and is lowercased.  A leading word found
    in commonleaders is dropped when other words follow it.  Each
    character is then passed through splitchar and the replacements
    table.  The key is a list alternating between text chunks and the
    integer value of each run of decimal digits.

    Doctests:
        >>> sortkeygen('  some extra    spaces  ')
        [u'some extra spaces']
        >>> sortkeygen('CasE InseNsItIve')
        [u'case insensitive']
        >>> sortkeygen('The Wind in the Willows')
        [u'wind in the willows']
        >>> sortkeygen(u'\u0132 ligature')
        [u'ij ligature']
        >>> sortkeygen(u'\xdd\xfd upper/lower case Y with acute accent')
        [u'yy upper/lower case y with acute accent']
        >>> sortkeygen('foo9.txt')
        [u'foo', 9, u'.txt']
        >>> sortkeygen('x9y99')
        [u'x', 9, u'y', 99]
    '''
    # Trim both ends and squeeze every whitespace run to one space.
    text = ' '.join(unicode(s).strip().split())
    # Case-insensitive comparison.
    text = text.lower()

    # Titles: skip a common leading word unless it is the only word.
    tokens = text.split()
    if len(tokens) > 1 and tokens[0] in commonleaders:
        text = ' '.join(tokens[1:])

    # Remove accents / split ligatures, then apply the single-character
    # replacement table to every resulting character.
    pieces = []
    for original in text:
        for plain in splitchar(original):
            pieces.append(replacements.get(plain, plain))
    text = ''.join(pieces)

    def isdecimal(ch):
        return ch in decdigits

    # Runs of decimal digits become ints; everything else stays text.
    key = []
    for numeric, run in groupby(text, isdecimal):
        chunk = ''.join(run)
        key.append(int(chunk) if numeric else chunk)
    return key

def naturalsort(items):
    '''Return a new list holding items in natural sort order.

    Every item is ranked by the key that sortkeygen produces for it;
    items itself is left untouched.

    Doctests:
        >>> naturalsort(['The Wind in the Willows','The 40th step more',
        ...              'The 39 steps', 'Wanda'])
        ['The 39 steps', 'The 40th step more', 'Wanda', 'The Wind in the Willows']

    '''
    ordered = list(items)
    ordered.sort(key=sortkeygen)
    return ordered

if __name__ == '__main__':
    import string

    ns = naturalsort

    # Every print below takes one parenthesised argument, so the output is
    # the same whether print is a statement or a function.

    def print_lines(lines):
        'Print the strings one per line, without repr quoting.'
        print('\n'.join(lines))

    def compare(txt, show=pp):
        '''Show txt as given, then in sorted() order, then in natural order.

        show displays the two sorted lists; the input is always shown
        with pp.
        '''
        print('Text strings:')
        pp(txt)
        print('Normally sorted :')
        show(sorted(txt))
        print('Naturally sorted:')
        show(ns(txt))

    print('\n# Ignoring leading spaces')
    compare(['%signore leading spaces: 2%+i' % (' ' * n, n - 2)
             for n in range(4)])

    print('\n# Ignoring multiple adjacent spaces (m.a.s)')
    compare(['ignore m.a.s%s spaces: 2%+i' % (' ' * n, n - 2)
             for n in range(4)])

    print('\n# Equivalent whitespace characters')
    compare(['Equiv.%sspaces: 3%+i' % (space, n - 3)
             for n, space in enumerate(reversed(string.whitespace))])

    print('\n# Case Indepenent sort')
    title = 'CASE INDEPENENT'
    compare([title[:cut].lower() + title[cut:] + ': 3%+i' % (cut - 3)
             for cut in range(1, 5)])

    print('\n# Numeric fields as numerics')
    compare(['foo100bar99baz0.txt', 'foo100bar10baz0.txt',
             'foo1000bar99baz10.txt', 'foo1000bar99baz9.txt'])

    print('\n# Title sorts')
    compare(['The Wind in the Willows', 'The 40th step more',
             'The 39 steps', 'Wanda'])

    print('\n# Equivalent accented characters (and case)')
    compare(['Equiv. %s accents: 2%+i' % (letter, n - 2)
             for n, letter in enumerate(u'\xfd\xddyY')])

    print('\n# Separated ligatures')
    compare([u'\462 ligatured ij', 'no ligature'])

    print('\n# Character replacements')
    specials = u'ʒſßs'  # u'\u0292\u017f\xdfs'
    # The sorted lists are printed line by line instead of through pp.
    compare(['Start with an %s: 2%+i' % (letter, n - 2)
             for n, letter in enumerate(specials)],
            show=print_lines)