42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
'''
|
|
This implements: http://en.wikipedia.org/wiki/Inverted_index of 28/07/10
|
|
'''
|
|
|
|
from pprint import pprint as pp
|
|
from glob import glob
|
|
try: reduce
|
|
except: from functools import reduce
|
|
try: raw_input
|
|
except: raw_input = input
|
|
|
|
|
|
def parsetexts(fileglob='InvertedIndex/T*.txt'):
|
|
texts, words = {}, set()
|
|
for txtfile in glob(fileglob):
|
|
with open(txtfile, 'r') as f:
|
|
txt = f.read().split()
|
|
words |= set(txt)
|
|
texts[txtfile.split('\\')[-1]] = txt
|
|
return texts, words
|
|
|
|
def termsearch(terms): # Searches simple inverted index
|
|
return reduce(set.intersection,
|
|
(invindex[term] for term in terms),
|
|
set(texts.keys()))
|
|
|
|
texts, words = parsetexts()
|
|
print('\nTexts')
|
|
pp(texts)
|
|
print('\nWords')
|
|
pp(sorted(words))
|
|
|
|
invindex = {word:set(txt
|
|
for txt, wrds in texts.items() if word in wrds)
|
|
for word in words}
|
|
print('\nInverted Index')
|
|
pp({k:sorted(v) for k,v in invindex.items()})
|
|
|
|
terms = ["what", "is", "it"]
|
|
print('\nTerm Search for: ' + repr(terms))
|
|
pp(sorted(termsearch(terms)))
|