公众号:尤而小屋
编辑:Peter
作者:Peter
大家好,我是Peter~
介绍一个基于深度学习的实战项目:基于长短期记忆模型LSTM的股价预测,主要内容包含:
如何通过yfinance下载金融股票数据;成交量、收盘价的可视化;如何生成股价的5日、10日等移动平均值;股价日收益的计算;基于LSTM建模预测收盘价等
LSTM的介绍
1、https://easyai.tech/ai-definition/lstm/
2、https://zh.d2l.ai/chapter_recurrent-modern/lstm.html
导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline
from pandas_datareader.data import DataReader
# 专门用来获取金融股票数据的第三方包
import yfinance as yf
from pandas_datareader import data as pdr
yf.pdr_override()
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler # 数据归一化
import warnings
warnings.filterwarnings("ignore")
生成数据
基于yfinance生成数据:
# Tickers to analyse; this order is reused by every per-company plot below.
tech_list = ['AAPL', 'GOOG', 'MSFT', 'AMZN']

# Download window: the three years ending now.
end = datetime.now()
try:
    start = datetime(end.year - 3, end.month, end.day)
except ValueError:
    # Only Feb 29 can be missing three years earlier; fall back to Feb 28.
    start = datetime(end.year - 3, end.month, 28)

# Download each ticker and bind the result to a module-level name equal to
# its symbol (AAPL, GOOG, MSFT, AMZN), because later cells use those names.
for stock in tech_list:
    globals()[stock] = yf.download(stock, start, end)

company_list = [AAPL, GOOG, MSFT, AMZN]
company_name = ["APPLE", "GOOGLE", "MICROSOFT", "AMAZON"]

# Tag every row with its company so the rows stay identifiable after stacking.
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack the four tables vertically into one long table.
df = pd.concat(company_list, axis=0)
df.tail()
[*********************100%%**********************] 1 of 1 completed
[*********************100%%**********************] 1 of 1 completed
[*********************100%%**********************] 1 of 1 completed
[*********************100%%**********************] 1 of 1 completed
数据信息
# AAPL is one of the four DataFrames (AAPL, GOOG, MSFT, AMZN).
# Preview its first five rows, then print column names, dtypes and null counts.
AAPL.head(n=5)
AAPL.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 752 entries, 2021-09-03 to 2024-08-30
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 752 non-null float64
1 High 752 non-null float64
2 Low 752 non-null float64
3 Close 752 non-null float64
4 Adj Close 752 non-null float64
5 Volume 752 non-null int64
6 company_name 752 non-null object
dtypes: float64(5), int64(1), object(1)
memory usage: 47.0+ KB
收盘价 Closing Price
# Adjusted closing price of each company, one panel per company on a 2x2 grid.
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)

for panel, (ticker, frame) in enumerate(zip(tech_list, company_list), start=1):
    plt.subplot(2, 2, panel)
    frame["Adj Close"].plot()
    plt.ylabel("Adj Close")
    plt.xlabel(None)
    plt.title(f"Closing Price of {ticker}")

plt.tight_layout()
成交量 Volume of Sales
# Traded volume of each company, one panel per company on a 2x2 grid.
plt.figure(figsize=(15, 10))
plt.subplots_adjust(top=1.25, bottom=1.2)

for panel, (ticker, frame) in enumerate(zip(tech_list, company_list), start=1):
    plt.subplot(2, 2, panel)
    frame["Volume"].plot()
    plt.ylabel("Volume")
    plt.xlabel(None)
    plt.title(f"Sales Volume of {ticker}")

plt.tight_layout()
不同股票的移动平均值Moving Average of Stocks
增加移动平均字段
# Rolling-window sizes, in days.
ma_day = [5, 10, 20, 50]

# Add one "MA for N days" column per window to every company table:
# the rolling mean of the adjusted close over the last N rows.
for company in company_list:
    adj_close = company["Adj Close"]
    for ma in ma_day:
        company[f"MA for {ma} days"] = adj_close.rolling(ma).mean()

# Show the columns of the last table processed to confirm the new fields.
company.columns
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'company_name',
'MA for 5 days', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days'],
dtype='object')
可视化效果
# Adjusted close together with its four moving averages, one panel per company.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_figheight(10)
fig.set_figwidth(15)

curves = ['Adj Close', 'MA for 5 days', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']
panels = [(AAPL, 'APPLE'), (GOOG, 'GOOGLE'), (MSFT, 'MICROSOFT'), (AMZN, 'AMAZON')]

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (frame, heading) in zip(axes.flat, panels):
    frame[curves].plot(ax=ax)
    ax.set_title(heading)

fig.tight_layout()
从上图中可以观察到,5日和10日均线能够更好地捕捉到数据的变化趋势。
日收益daily return
pct_change 函数在 pandas 中非常有用,特别是在处理时间序列数据时。这个函数计算序列中每个元素与前一个元素之间的百分比变化。它通常用于财务数据分析,比如计算股票价格、货币汇率、销售量等的日百分比变化率。
计算过程pct_change
# Daily return = percentage change of the adjusted close relative to the
# previous row, stored as a new column on every company table.
for company in company_list:
    adjusted = company["Adj Close"]
    company["Daily Return"] = adjusted.pct_change()

# Preview the last table processed.
company.head()
可视化(带数据点标记的折线图)
# Daily return over time for each company, drawn as a dashed line with markers.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_figheight(10)
fig.set_figwidth(15)

panels = [(AAPL, 'APPLE'), (GOOG, 'GOOGLE'), (MSFT, 'MICROSOFT'), (AMZN, 'AMAZON')]

# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (frame, heading) in zip(axes.flat, panels):
    frame['Daily Return'].plot(ax=ax, legend=True, linestyle='--', marker='o')
    ax.set_title(heading)

fig.tight_layout()
可视化(直方图hist)
# Distribution of daily returns, one histogram per company.
plt.figure(figsize=(12, 9))

for slot, (label, frame) in enumerate(zip(company_name, company_list), start=1):
    plt.subplot(2, 2, slot)              # position on the 2x2 grid
    frame['Daily Return'].hist(bins=50)  # 50 bins per histogram
    plt.xlabel('Daily Return')
    plt.ylabel('Counts')
    plt.title(f'{label}')

plt.tight_layout()
相关性
单独生成股价的收盘价作为closing_df:
# pandas_datareader pdr
# Fetch all four tickers in a single call (pdr is pandas_datareader.data),
# then keep only the adjusted close prices.
all_prices = pdr.get_data_yahoo(tech_list, start=start, end=end)
closing_df = all_prices['Adj Close']
closing_df.head()
[*********************100%%**********************] 4 of 4 completed
对closing_df使用pct_change函数:
# Row-over-row percentage change of every adjusted-close column.
tech_rets = closing_df.pct_change()
tech_rets.head(n=5)
可视化展示
自身相关(同一只股票与自身对比)
# GOOG daily returns plotted against themselves as a baseline.
sns.jointplot(data=tech_rets, x='GOOG', y='GOOG', kind='scatter', color='seagreen')
plt.show()
两两相关
# GOOG daily returns against MSFT, then against AMZN; one figure each.
for other in ('MSFT', 'AMZN'):
    sns.jointplot(data=tech_rets, x='GOOG', y=other, kind='scatter', color='blue')
    plt.show()
整体相关性(sns.pairplot)
# Every pair of return columns, with a regression fit in each panel.
sns.pairplot(data=tech_rets, kind='reg')
plt.show()
整体相关性(sns.PairGrid)
使用sns.PairGrid()可以让我们更灵活地控制图表的布局和类型:
# Pair grid over daily returns, with rows containing missing values dropped first.
clean_rets = tech_rets.dropna()
return_fig = sns.PairGrid(clean_rets)

return_fig.map_diag(plt.hist, bins=30)             # diagonal: histograms
return_fig.map_upper(plt.scatter, color='purple')  # upper triangle: scatter plots
return_fig.map_lower(sns.kdeplot, cmap='cool_d')   # lower triangle: KDE plots
plt.show()
收盘价的整体相关性:
# Same pair grid, built on the adjusted close prices instead of the returns.
returns_fig = sns.PairGrid(closing_df)

returns_fig.map_diag(plt.hist, bins=30)             # diagonal: histograms
returns_fig.map_upper(plt.scatter, color='purple')  # upper triangle: scatter plots
returns_fig.map_lower(sns.kdeplot, cmap='cool_d')   # lower triangle: KDE plots
plt.show()
热力图heatmap
# Correlation heatmaps side by side: daily returns first, closing prices second.
plt.figure(figsize=(12, 10))

panels = [
    (tech_rets, 'Correlation of stock return'),
    (closing_df, 'Correlation of stock closing price'),
]
for slot, (table, heading) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    sns.heatmap(table.corr(), annot=True, cmap='summer')
    plt.title(heading)

plt.show()
风险评估value risk
有许多方法可以量化风险,使用收集到的每日百分比回报率信息来量化风险的最基本方法之一,是通过将预期回报率与每日回报率的标准差进行比较。
# Risk vs. expected return: mean daily return on x, its standard deviation on y.
rets = tech_rets.dropna()
area = np.pi * 20  # marker size passed to scatter

expected = rets.mean()
risk = rets.std()

plt.figure(figsize=(10, 8))
plt.scatter(expected, risk, s=area)
plt.xlabel("Expected Return")
plt.ylabel("Risk")

# Label every point with its ticker, offset from the marker and joined by a curved line.
arrow_style = dict(arrowstyle='-', color='blue', connectionstyle='arc3,rad=-0.3')
for label, x, y in zip(rets.columns, expected, risk):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(50, 50),
        textcoords='offset points',
        ha='right',
        va='bottom',
        arrowprops=arrow_style,
    )
建模预测Predicting the closing price
项目地址:https://www.kaggle.com/code/faressayah/stock-market-analysis-prediction-using-lstm
生成数据
# AAPL price history from the start of 2012 up to the moment the cell runs.
today = datetime.now()
df = pdr.get_data_yahoo("AAPL", start="2012-01-01", end=today)
df.tail()
可视化
# Closing price over the whole downloaded period.
plt.figure(figsize=(16, 6))
plt.title('Close Price History')
plt.plot(df['Close'])
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.show()
提取Close数据
# Keep only the "Close" column (still a DataFrame), then take its values as a
# 2-D array with one column, which is the input the scaler works on.
data = df.filter(items=["Close"])
dataset = data.values
dataset[:5]
array([[14.6867857 ],
[14.76571369],
[14.92964268],
[15.08571434],
[15.0617857 ]])
训练集数据长度
# Number of rows used for training: 95% of the series, rounded up.
total_rows = dataset.shape[0]
training_dataset_len = int(np.ceil(total_rows * 0.95))
training_dataset_len
3027
数据归一化
from sklearn.preprocessing import MinMaxScaler  # min-max normalisation

scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the min/max from the training rows only, so nothing from the held-out
# tail leaks into preprocessing. The whole series is then transformed with
# those statistics; held-out values may fall outside [0, 1].
scaler.fit(dataset[:training_dataset_len])
scaled_data = scaler.transform(dataset)
scaled_data
array([[0.00334711],
[0.00370446],
[0.00444665],
...,
[0.96228593],
[0.97722662],
[0.97364992]])
选择训练集数据
选择指定长度下的训练集数据:
# Scaled rows that belong to the training split.
train_data = scaled_data[:training_dataset_len, :]
切分训练集
将train_data切分成x_train和y_train:
# Sliding windows: each sample is the previous `window` scaled closes and its
# target is the value that immediately follows.
window = 60
x_train, y_train = [], []

for stop in range(window, len(train_data)):
    x_train.append(train_data[stop - window:stop, 0])
    y_train.append(train_data[stop, 0])
    # Print the accumulated samples for the first two iterations as a sanity check.
    if stop <= window + 1:
        print(x_train)
        print(y_train)
        print()

x_train = np.array(x_train)
y_train = np.array(y_train)

# Reshape to (samples, timesteps, 1), i.e. one feature per time step.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
构建LSTM网络
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Two stacked LSTM layers followed by two dense layers ending in one output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])
编译&训练网络compile+fit
# 编译
model.compile(optimizer="adam", loss="mean_squared_error")
# 训练
model.fit(x_train, y_train, batch_size=1, epochs=1)
2967/2967 [==============================] - 36s 11ms/step - loss: 7.2025e-04
测试集生成
# Start 60 rows before the split so the first test sample has a full window.
test_data = scaled_data[training_dataset_len - 60:, :]

# Targets are the unscaled closes after the split.
y_test = dataset[training_dataset_len:, :]

# One 60-step window per held-out row.
x_test = np.array([test_data[i - 60:i, 0] for i in range(60, len(test_data))])

# Reshape to (samples, timesteps, 1), the same layout as the training input.
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
预测过程predict
# Predict in the scaled space, then map the outputs back to price units.
scaled_predictions = model.predict(x_test)
predictions = scaler.inverse_transform(scaled_predictions)
5/5 [==============================] - 1s 16ms/step
计算RMSE
# Root-mean-squared error between predicted and actual closing prices.
errors = predictions - y_test
rmse = np.sqrt(np.mean(errors ** 2))
rmse
4.422715565017136
预测可视化
# Split the Close table at the same boundary used for training.
train = data[:training_dataset_len]          # training rows
# Copy the held-out rows so the new column is added to an independent frame
# rather than to a slice of `data` (which pandas treats as chained assignment).
valid = data[training_dataset_len:].copy()   # held-out rows
valid["Predictions"] = predictions

plt.figure(figsize=(16, 6))
plt.plot(train['Close'])
plt.plot(valid[['Close', 'Predictions']])

# Axis labels, legend and title.
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.title('Model')
plt.show()