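"""Generate random sentences with a simple Markov chain.

Transition counts for single words and word pairs are built from the
Project Gutenberg text of "The War of the Worlds" by H. G. Wells.
"""
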
import random
import re
from collections import defaultdict
from string import punctuation
from urllib.request import urlopen


# The War of the Worlds, by H. G. Wells
text_url = 'http://www.gutenberg.org/files/36/36-0.txt'
text_start = 'No one would have believed'

sentence_ending = '.!?'
sentence_pausing = ',;:'


def read_book(text_url: str, text_start: str) -> str:
    "Download the book and return its text from text_start onwards."
    with urlopen(text_url) as book:
        text = book.read().decode('utf-8')
    return text[text.index(text_start):]


def remove_punctuation(text: str, keep=sentence_ending+sentence_pausing) -> str:
    "Remove punctuation, keeping some"
    to_remove = ''.join(set(punctuation) - set(keep))
    text = text.translate(str.maketrans(to_remove, ' ' * len(to_remove))).strip()
    text = re.sub(fr"[^a-zA-Z0-9{keep}\n ]+", ' ', text)
    # Remove duplicates and put space around remaining punctuation
    if keep:
        text = re.sub(f"([{keep}])+", r" \1 ", text).strip()
    if text[-1] not in sentence_ending:
        text += ' .'
    return text.lower()


def word_follows_words(txt_with_pauses_and_endings):
    "Return dicts of the frequency of words following one word / two words."
    words = ['.'] + txt_with_pauses_and_endings.strip().split()

    # Count which word follows a single word, and which follows a word pair.
    word2next = defaultdict(lambda: defaultdict(int))
    word2next2 = defaultdict(lambda: defaultdict(int))
    for lh, rh in zip(words, words[1:]):
        word2next[lh][rh] += 1
    for lh, mid, rh in zip(words, words[1:], words[2:]):
        word2next2[(lh, mid)][rh] += 1

    return dict(word2next), dict(word2next2)
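
# word2next maps word -> {following word: count}; word2next2 maps a pair of
# words -> {following word: count}.  gen_sentence samples from these counts
# with random.choices, so more frequent continuations are chosen more often.
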

def gen_sentence(word2next, word2next2) -> str:
    "Generate one sentence by walking the chain until a sentence-ending mark."
    # Start at the sentence boundary '.' and pick a first word weighted by
    # how often it follows a full stop in the text.
    s = ['.']
    s += random.choices(*zip(*word2next[s[-1]].items()))
    while True:
        # Pick each further word weighted by how often it follows the
        # previous two words.
        s += random.choices(*zip(*word2next2[(s[-2], s[-1])].items()))
        if s[-1] in sentence_ending:
            break

    # Tidy up: capitalise, close up punctuation, restore common contractions.
    s = ' '.join(s[1:]).capitalize()
    s = re.sub(fr" ([{sentence_ending+sentence_pausing}])", r'\1', s)
    s = re.sub(r" re\b", "'re", s)
    s = re.sub(r" s\b", "'s", s)
    s = re.sub(r"\bi\b", "I", s)

    return s


if __name__ == "__main__":
    txt_with_pauses_and_endings = remove_punctuation(read_book(text_url, text_start))
    word2next, word2next2 = word_follows_words(txt_with_pauses_and_endings)
    #%%
    sentence = gen_sentence(word2next, word2next2)
    print(sentence)
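    # A few more samples, to show the variety of the chain's output.
    for _ in range(3):
        print(gen_sentence(word2next, word2next2))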