機械学習を用いた株価予測のデモ - 投資のためのデータサイエンス

今回は実用性とは若干乖離しますが、機械学習を用いた株価予測について調べましたので、関連コードを紹介します。
今回紹介するコードは英文のgitリポジトリのサイトからの引用です。
ここではLSTMという機械学習手法を用いています。LSTM (Long Short-Term Memory) ネットワークは、長期的な依存関係を学習することのできる、RNN（リカレントニューラルネットワーク）の特別な一種です。

まずライブラリをインポートして、次に株価データを取得します。

# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam
from pandas_datareader import data as pdr
from datetime import date, datetime, timedelta
import yfinance as yf

# Download price data for the last 180 calendar days from Yahoo Finance
ticker = '9984.T'  # 9984 is SoftBank Group
end_date = datetime.today()
start_date = end_date - timedelta(days=180)

# Make pandas_datareader's Yahoo reader use yfinance, then fetch the data
yf.pdr_override()
data = pdr.get_data_yahoo(ticker, start=start_date, end=end_date)

# NOTE: the "_AAPL" suffix is a leftover name; the series holds the closing
# prices of `ticker` (SoftBank Group here), whatever the variable name says.
close_prices_AAPL = data['Close']

次にデータの前処理を行います。一般に、機械学習モデルに入力する前にデータのスケーリングが必要です（ここでは最大値で割って正規化しています）。そしてデータの次元をモデルに入力できる形に変形し、さらに学習用データとテスト用データに分けます。

# The series is used in its downloaded order (no reversal is applied); the
# "_reverse" name is only an alias of the same object.
close_prices_AAPL_reverse = close_prices_AAPL

# Replace the index with 0..n-1 so the plot's x-axis is a simple step count.
# This is done in place, so `close_prices_AAPL` is re-indexed as well.
close_prices_AAPL_reverse.reset_index(drop=True, inplace=True)

# Column vector of prices, scaled by the maximum price in the series
data = close_prices_AAPL_reverse.to_numpy().reshape(-1, 1)
data_normalized = data / data.max()

# Chronological 80/20 split: first part for training, remainder for testing
train_size = int(0.8 * len(data_normalized))
train_data, test_data = data_normalized[:train_size], data_normalized[train_size:]

続いてLSTMモデルを構築します。モデルの構造はさらに改良することが可能とみられます。

def create_lstm_model(units, activation, learning_rate):
    """Build and compile a single-layer LSTM regression model.

    The network accepts sequences of one time step with one feature, feeds
    them through one LSTM layer followed by a one-unit dense output layer,
    and is compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        units: Number of units in the LSTM layer.
        activation: Activation function name passed to the LSTM layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (not yet trained).
    """
    layers = [
        LSTM(units=units, activation=activation, input_shape=(1, 1)),
        Dense(units=1),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
    )
    return network

続いて学習用データでモデルを学習し、テスト用データに対するRMSE（平均二乗誤差の平方根）を評価基準としてハイパーパラメータのチューニングを行います。

# Hyperparameter grid for the LSTM search
lstm_units = [50, 100, 200]
lstm_activations = ['relu', 'tanh']
learning_rates = [0.001, 0.01, 0.1]
epochs = 100
batch_size = 32

# One-step-ahead pairs: the value at step t is the input and the value at
# step t+1 is the target. They do not depend on the hyperparameters, so they
# are built once instead of on every grid iteration.
train_inputs = train_data[:-1].reshape(-1, 1, 1)
train_targets = train_data[1:]
test_inputs = test_data[:-1].reshape(-1, 1, 1)
test_targets = test_data[1:]

# Grid search: keep the model with the lowest RMSE (on the normalized scale).
# NOTE(review): the score is computed on test_data, which is therefore also
# the selection criterion; best_rmse is likely optimistic as an estimate of
# out-of-sample error.
best_rmse = float('inf')
best_lstm_model = None

for units in lstm_units:
    for activation in lstm_activations:
        for learning_rate in learning_rates:
            # Create and train a model for this configuration
            model = create_lstm_model(units=units, activation=activation, learning_rate=learning_rate)
            model.fit(train_inputs, train_targets, epochs=epochs, batch_size=batch_size, verbose=0)

            # Predict on the test inputs
            test_predictions = model.predict(test_inputs).flatten()

            # A diverged fit can produce NaN/inf predictions, and
            # mean_squared_error rejects non-finite input, which would abort
            # the whole search. Skip such configurations instead.
            if not np.all(np.isfinite(test_predictions)):
                continue

            rmse = np.sqrt(mean_squared_error(test_targets, test_predictions))

            # Remember the best configuration seen so far
            if rmse < best_rmse:
                best_rmse = rmse
                best_lstm_model = model

# Fail with a clear message rather than an AttributeError on None below
if best_lstm_model is None:
    raise RuntimeError("No LSTM configuration produced finite predictions; cannot select a model.")

# One-step-ahead predictions over the entire dataset using the best model
all_lstm_predictions = best_lstm_model.predict(data_normalized[:-1].reshape(-1, 1, 1)).flatten()

# Undo the max-scaling to get predictions on the original price scale
all_lstm_predictions = all_lstm_predictions * np.max(data)

# Scaling factor (maximum of the original series) used to rescale forecasts
scaling_factor = np.max(close_prices_AAPL_reverse)

最後にモデルを用いて直近の将来10時点の値を予測します。

def predict_future_lstm(model, data, num_predictions, scaling_factor):
    """Roll a one-step-ahead model forward to forecast future values.

    Starting from the last observation in ``data``, the model predicts the
    next value; that prediction is then fed back in as the input for the
    following step, ``num_predictions`` times in total.

    Args:
        model: Object exposing ``predict(x)`` that accepts an array of shape
            ``(1, 1, 1)`` and returns an array indexable as ``[0, 0]``.
        data: Normalized series, either 1-D ``(n,)`` or a column ``(n, 1)``;
            array-likes such as lists are accepted.
        num_predictions: Number of future steps to generate.
        scaling_factor: Multiplier that maps the normalized predictions back
            to the original scale.

    Returns:
        numpy.ndarray with ``num_predictions`` rescaled predictions.

    Raises:
        IndexError: If ``data`` is empty.
    """
    series = np.asarray(data)

    # Only the most recent value is needed: the model looks back one step.
    # np.reshape handles both a scalar (1-D data) and a length-1 row
    # (column data), so both layouts are supported.
    model_input = np.reshape(series[-1], (1, 1, 1))

    predictions = []
    for _ in range(num_predictions):
        # Predict the next time step
        prediction = model.predict(model_input)
        next_value = prediction[0, 0]
        predictions.append(next_value)

        # Feed the prediction back in as the input for the next iteration
        model_input = np.reshape(next_value, (1, 1, 1))

    # Inverse normalize the predictions
    return np.array(predictions) * scaling_factor

# Forecast the next 10 steps with the selected LSTM model
num_predictions = 10
lstm_predictions = predict_future_lstm(best_lstm_model, data_normalized, num_predictions, scaling_factor)

# x positions of the forecast: they continue right after the last observation
n_observed = len(close_prices_AAPL_reverse)
future_steps = np.arange(n_observed, n_observed + num_predictions)

# Draw the observed series and the forecast on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(close_prices_AAPL_reverse, label='Actual')
ax.plot(future_steps, lstm_predictions, label='LSTM Predicted')
ax.set_title(f"LSTM Model - RMSE: {best_rmse:.2f}")
ax.set_xlabel('Time')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

# List the forecast values, numbered from day 1
print("Predicted stock prices for the next 10 days:")
for day_number, predicted_price in enumerate(lstm_predictions, start=1):
    print(f"Day {day_number}: {predicted_price:.2f}")

オリジナルのコードでは、他の手法による予測も行っています。こうした手法の実用性についてはさらなる調査が必要です。
※ローカルPCで本コードを実行する場合は機械学習ライブラリのインストールが必要ですが、Google Colabではyfinanceライブラリを事前にインストールすれば動きます。