# practice

import random
import numpy as np
import tflearn

def string_to_semi_redundant_sequences(string, seq_maxlen=25, redun_step=3):
    """ string_to_semi_redundant_sequences.

    Vectorize a string and returns parsed sequences and targets, along with
    the associated dictionary.

    Arguments:
        string: `str`. Lower-case text from input text file.
        seq_maxlen: `int`. Maximum length of a sequence. Default: 25.
        redun_step: `int`. Redundancy step. Default: 3.

    Returns:
        `tuple`: (inputs, targets, dictionary) where inputs has shape
        (n_sequences, seq_maxlen, n_chars), targets has shape
        (n_sequences, n_chars) — both one-hot boolean arrays — and
        dictionary maps each character to its one-hot index.
    """

    print("Vectorizing text...")
    # Sort so char -> index mapping is deterministic across runs
    # (plain set iteration order changes with hash randomization).
    chars = sorted(set(string))
    char_idx = {c: i for i, c in enumerate(chars)}

    # Slide a window of seq_maxlen over the text, stepping by redun_step;
    # the character immediately after each window is its prediction target.
    sequences = []
    next_chars = []
    for i in range(0, len(string) - seq_maxlen, redun_step):
        sequences.append(string[i: i + seq_maxlen])
        next_chars.append(string[i + seq_maxlen])

    # np.bool was removed in NumPy 1.24 — use the builtin bool dtype.
    X = np.zeros((len(sequences), seq_maxlen, len(chars)), dtype=bool)
    Y = np.zeros((len(sequences), len(chars)), dtype=bool)
    for i, seq in enumerate(sequences):
        for t, char in enumerate(seq):
            X[i, t, char_idx[char]] = 1
        Y[i, char_idx[next_chars[i]]] = 1

    print("Text total length: " + str(len(string)))
    print("Distinct chars: " + str(len(chars)))
    print("Total sequences: " + str(len(sequences)))

    return X, Y, char_idx

def textfile_to_semi_redundant_sequences(path, seq_maxlen=25, redun_step=3,
                                         to_lower_case=False):
    """ Vectorize a text file into semi-redundant sequences.

    Arguments:
        path: `str`. Path of the input text file.
        seq_maxlen: `int`. Maximum length of a sequence. Default: 25.
        redun_step: `int`. Redundancy step. Default: 3.
        to_lower_case: `bool`. Lower-case the text before vectorizing.
            Default: False (preserves previous behavior).

    Returns:
        `tuple`: (inputs, targets, dictionary).
    """
    # Context manager guarantees the file handle is closed even on error.
    with open(path) as f:
        text = f.read()
    if to_lower_case:
        text = text.lower()
    return string_to_semi_redundant_sequences(text, seq_maxlen, redun_step)


def random_sequence_from_textfile(path, seq_maxlen):
    """ Return a random contiguous sequence of `seq_maxlen` characters
    read from the text file at `path`. """
    # Context manager guarantees the file handle is closed even on error.
    with open(path) as f:
        text = f.read()
    return random_sequence_from_string(text, seq_maxlen)

def random_sequence_from_string(string, seq_maxlen):
    """ Return a random contiguous substring of length `seq_maxlen`.

    If the string is no longer than `seq_maxlen`, the whole string is
    returned (the original code raised ValueError from randint in that
    case).
    """
    if len(string) <= seq_maxlen:
        return string
    rand_index = random.randint(0, len(string) - seq_maxlen - 1)
    return string[rand_index: rand_index + seq_maxlen]


# --- Training script -------------------------------------------------------
# Builds a 2-layer character-level LSTM and alternates 1 epoch of training
# with a sample generation, 50 times.
# NOTE(review): checkpoint_path/run_id say "shakespeare" but the corpus is
# "xajh.txt" — looks like names were copied from the tflearn Shakespeare
# example; confirm and rename for clarity.
path = "xajh.txt"
maxlen = 25

X, Y, char_idx = \
    textfile_to_semi_redundant_sequences(path, seq_maxlen=maxlen, redun_step=3)

# Network: one-hot input (batch, maxlen, vocab) -> LSTM(512, seq) -> dropout
# -> LSTM(512) -> dropout -> softmax over the vocabulary.
g = tflearn.input_data([None, maxlen, len(char_idx)])
g = tflearn.lstm(g, 512, return_seq=True)
g = tflearn.dropout(g, 0.5)
# g = tflearn.lstm(g, 512, return_seq=True)
# g = tflearn.dropout(g, 0.5)
g = tflearn.lstm(g, 512)
g = tflearn.dropout(g, 0.5)
g = tflearn.fully_connected(g, len(char_idx), activation='softmax')
g = tflearn.regression(g, optimizer='adam', loss='categorical_crossentropy',
                       learning_rate=0.001)

m = tflearn.SequenceGenerator(g, dictionary=char_idx,
                              seq_maxlen=maxlen,
                              clip_gradients=5.0,
                              checkpoint_path='model_shakespeare')

for i in range(50):
    # Fresh random seed text each round; used only by generate(), not fit().
    seed = random_sequence_from_textfile(path, maxlen)
    m.fit(X, Y, validation_set=0.1, batch_size=128, n_epoch=1, run_id='shakespeare')
    print("-- TESTING...")
    print("-- Test with temperature of 1.0 --")
    print(m.generate(600, temperature=1.0, seq_seed=seed))
    # print("-- Test with temperature of 0.5 --")
    # print(m.generate(600, temperature=0.5, seq_seed=seed))

# Last updated