In [1]:
def load_valid_words(file_path='wordle_words.txt'):
    """
    Load valid five-letter words from a specified text file.

    Every line is stripped of surrounding whitespace and kept only if the
    stripped text is exactly five characters long. Words are returned in file
    order; no case normalization or de-duplication is performed.

    Parameters:
    - file_path (str): The path to the text file containing valid words.

    Returns:
    - list[str]: A list of valid words loaded from the file.

    Raises:
    - OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale-specific default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Strip each line once, then filter on the stripped length.
        stripped_lines = (line.strip() for line in file)
        valid_words = [word for word in stripped_lines if len(word) == 5]
    return valid_words

In [2]:
from stable_baselines3 import PPO, DQN  # Or any other suitable RL algorithm
from stable_baselines3.common.env_checker import check_env
from letter_guess import LetterGuessingEnv
from tqdm import tqdm

In [3]:
# Build the environment from the word list returned by load_valid_words()
# (which reads 'wordle_words.txt' by default).
env = LetterGuessingEnv(valid_words=load_valid_words())
# Optional: have Stable-Baselines3 verify that the environment follows the API it expects.
check_env(env)

In [4]:
# Snapshot the environment's current state so a later cell can restore it with
# env.set_state(initial_state).
# NOTE(review): check_env(env) has already exercised the env at this point, so this
# may not be a freshly reset state — confirm this is the snapshot you want.
initial_state = env.clone_state()

In [13]:
# Start a new episode. reset() returns a pair; only the observation is kept and the
# second element (the info value in the Gymnasium API) is discarded.
obs, _ = env.reset()

In [14]:
# Load a previously saved PPO model from disk. No environment is attached here;
# observations are passed to model.predict() directly in the cells that follow.
model_save_path = "wordle_ppo_model"
model = PPO.load(model_save_path)

In [15]:
# Ask the loaded policy for an action given the current observation.
# predict() returns a pair; the second element is not used here.
action, _ = model.predict(obs)
# Apply the action. step() returns five values; the fourth (the "truncated" flag in
# the Gymnasium API) is discarded, so `done` reflects only the third value.
obs, reward, done, _, info = env.step(action)

In [24]:
# Reduce the action to an index in 0..25, i.e. a position in the alphabet.
# NOTE(review): presumably the action space enumerates (slot, letter) pairs with the
# letter as the fastest-varying part — confirm against LetterGuessingEnv.
action % 26

5

In [28]:
# Sanity check: zero-based alphabet position of the letter 'f'.
ord('f') - ord('a')

5

In [26]:
# Decode the action's alphabet index (action % 26) into the matching lowercase letter.
chr(ord('a') + action % 26)

'f'

In [16]:
# Display the observation returned by the most recent env.step() call.
obs

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1])

In [17]:
# Roll the environment back to the snapshot captured earlier with env.clone_state().
env.set_state(initial_state)

In [20]:
# Compare the restored environment's observation with `obs`, element by element.
# `obs` still holds the observation produced by the earlier step, so a mismatch here
# is consistent with the state having been rolled back.
# NOTE(review): assumes env.get_obs() returns an array with the same shape as `obs`.
all(env.get_obs() == obs)

False

In [None]:
# Perform your action to see the outcome.
# Placeholder: a randomly sampled valid action; replace with the action you want to probe.
action = env.action_space.sample()
# step() returns five values in this notebook's environment (see the other step() calls):
# observation, reward, terminated, truncated, info.
observation, reward, terminated, truncated, info = env.step(action)
# The episode is over when either flag is set.
done = terminated or truncated

# Revert to the initial state (same call used elsewhere in this notebook).
env.set_state(initial_state)

In [4]:
import wandb
from wandb.integration.sb3 import WandbCallback

In [5]:
# Where the freshly trained model will be written by model.save().
model_save_path = "wordle_ppo_model_test"

# Hyperparameters for this run; the same dict is handed to Weights & Biases.
config = dict(policy_type="MlpPolicy", total_timesteps=200_000)

# Start a W&B run in the "wordle" project and mirror TensorBoard logs to it.
run = wandb.init(project="wordle", config=config, sync_tensorboard=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mltcptgeneral[0m ([33mfulltime[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
try:
    # Create a PPO agent on the notebook's environment; TensorBoard logs are written
    # under runs/<run id>.
    model = PPO(
        config["policy_type"],
        env=env,
        verbose=2,
        tensorboard_log=f"runs/{run.id}",
        batch_size=64,
    )

    # Train for the configured number of timesteps; WandbCallback saves the model
    # under models/<run id>.
    model.learn(
        total_timesteps=config["total_timesteps"],
        callback=WandbCallback(
            model_save_path=f"models/{run.id}",
            verbose=2,
        ),
        progress_bar=True,
    )
finally:
    # Always close the W&B run, even when training fails or is interrupted from the
    # notebook, so the run is not left open.
    run.finish()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to runs/cyh5nscz/PPO_1


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.54     |
|    ep_rew_mean     | -3.66    |
| time/              |          |
|    fps             | 721      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.53        |
|    ep_rew_mean          | -3.61       |
| time/                   |             |
|    fps                  | 718         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011673957 |
|    clip_fraction        | 0.0292      |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.25       |
|    explained_variance   | -0.126      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.576       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0197     |
|    value_loss           | 3.58        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.7         |
|    ep_rew_mean          | -3.56       |
| time/                   |             |
|    fps                  | 698         |
|    iterations           | 3           |
|    time_elapsed         | 8           |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.019258872 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.22       |
|    explained_variance   | -0.211      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.187       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0215     |
|    value_loss           | 0.637       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.73        |
|    ep_rew_mean          | -3.43       |
| time/                   |             |
|    fps                  | 681         |
|    iterations           | 4           |
|    time_elapsed         | 12          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.021500897 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.17       |
|    explained_variance   | 0.378       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.185       |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0214     |
|    value_loss           | 0.479       |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.92        |
|    ep_rew_mean          | -3.36       |
| time/                   |             |
|    fps                  | 682         |
|    iterations           | 5           |
|    time_elapsed         | 14          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018113121 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.11       |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.203       |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0183     |
|    value_loss           | 0.455       |
-----------------------------------------


In [None]:
# Persist the trained model to the path held in model_save_path.
model.save(model_save_path)

In [None]:
# Reload the saved model from disk, replacing the in-memory `model`, before evaluating it.
model = PPO.load(model_save_path)

In [None]:
# Evaluate the loaded policy: play a fixed number of episodes and report the mean
# total reward per episode.
n_eval_episodes = 1000
rewards = 0
for _episode in tqdm(range(n_eval_episodes)):
    obs, _reset_info = env.reset()
    done = False
    while not done:
        # predict() is called with its default settings, as in the rest of the notebook.
        action, _policy_state = model.predict(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        # End the episode on either flag; ignoring `truncated` would keep stepping an
        # environment that has already ended the episode by truncation.
        done = terminated or truncated
        rewards += reward
print(rewards / n_eval_episodes)