Habe den HMM auf python3 umgestellt, da
- die HMM-Funktion in R nicht mehr konvergiert bzw. eine Fehlermeldung zurückgibt (warum auch immer),
- die Datenprozessierung schneller ist,
- es weniger Speicher braucht.
import os;
import wget; # pip3 install wget
import numpy; # pip3 install numpy
import pandas; # pip3 install pandas
import matplotlib; # pip3 install matplotlib
from hmmlearn import hmm; # pip install --upgrade --user hmmlearn
######################################################################
# weighted mean
def wm(x):
return (x['VOLUME'] * x['PRICE']).sum()/x['VOLUME'].sum();
# removes incomplete weeks
def rm(df, column, n):
for i in range(n):
if df[column].iloc[0] != subdata[column].iloc[i]:
return df.iloc[i:df.shape[0]];
return df;
def fill(df, column):
x = numpy.zeros([df.shape[0]]);
c = 0; x[0] = c;
for i in range(df.shape[0]-1):
if df[column].iloc[i] != df[column].iloc[i+1]:
c += 1;
x[i+1] = c;
return x.astype(int);
######################################################################
# download and/or load bitstamp data
file_name = 'bitstampUSD.csv.gz';
# but make sure, that the file is up to date
if os.path.exists(file_name) == False:
file_name = wget.download('https://api.bitcoincharts.com/v1/csv/bitstampUSD.csv.gz');
data = pandas.read_csv(file_name, compression='gzip', header=None);
data.columns = ['EPOCH', 'PRICE', 'VOLUME'];
data = data.sort_values(by='EPOCH');
data['DATE'] = pandas.to_datetime(data['EPOCH'],unit='s').dt.tz_localize('utc').dt.tz_convert('Europe/Berlin');
data['DAY'] = [str(x) for x in pandas.Series(data['DATE']).dt.date];
# firstly group by day for faster processing
subdata = data.groupby('DAY').first();
subdata['OPEN'] = data.groupby('DAY').first()['PRICE'];
subdata['CLOSE'] = data.groupby('DAY').last()['PRICE'];
subdata['MAX'] = data.groupby('DAY').max()['PRICE'];
subdata['MIN'] = data.groupby('DAY').min()['PRICE'];
subdata['VOLUME'] = data.groupby('DAY').sum()['VOLUME'];
subdata['PRICE'] = data.groupby(['DAY']).apply(wm);
subdata['MA50'] = subdata["MAX"].rolling(50).mean();
subdata['MA200'] = subdata["MAX"].rolling(200).mean();
subdata['RETURN'] = (subdata['CLOSE'] - subdata['OPEN'])/subdata['OPEN'];
subdata['WEEK'] = [str(pandas.Timestamp(x).week) for x in subdata['DATE']];
######################################################################
# not very elegant way to get unique week annotation
# remove first incomplete WEEK
subdata = rm(subdata, 'WEEK', 7);
# remove last incomplete WEEK
subdata = subdata.sort_values(by='EPOCH', ascending=False);
subdata = rm(subdata, 'WEEK', 7);
subdata = subdata.sort_values(by='EPOCH');
subdata['WEEK'] = fill(subdata,'WEEK');
######################################################################
# secondly group by week
subsubdata = subdata.groupby('WEEK').first();
subsubdata['DAY'] = [str(x) for x in pandas.Series(subsubdata['DATE']).dt.date];
subsubdata['OPEN'] = subdata.groupby('WEEK').first()['OPEN'];
subsubdata['CLOSE'] = subdata.groupby('WEEK').last()['CLOSE'];
subsubdata['MAX'] = subdata.groupby('WEEK').max()['MAX'];
subsubdata['MIN'] = subdata.groupby('WEEK').min()['MIN'];
subsubdata['VOLUME'] = subdata.groupby('WEEK').sum()['VOLUME'];
subsubdata['PRICE'] = subdata.groupby(['WEEK']).apply(wm);
subsubdata['LOG10_PRICE'] = numpy.log10(subsubdata['PRICE']);
subsubdata['LOG10_PRICE'] = subsubdata['LOG10_PRICE']/numpy.amax(subsubdata['LOG10_PRICE']);
subsubdata['LOG10_MA50'] = numpy.log10(subsubdata['MA50']);
subsubdata['LOG10_MA50'] = subsubdata['LOG10_MA50']/numpy.amax(subsubdata['LOG10_MA50']);
subsubdata['LOG10_MA200'] = numpy.log10(subsubdata['MA200']);
subsubdata['LOG10_MA200'] = subsubdata['LOG10_MA200']/numpy.amax(subsubdata['LOG10_MA200']);
subsubdata['RETURN'] = (subsubdata['CLOSE'] - subsubdata['OPEN'])/subsubdata['OPEN'];
# at least not wrong ...
subsubdata = subsubdata.sort_values(by='EPOCH');
# create a model and fit it to data
model = hmm.GaussianHMM(4, "diag", n_iter=1000);
X = numpy.asarray(subsubdata['RETURN']).reshape(-1,1);
fit = model.fit(X);
hidden_states = model.predict(X);
hidden_probs = model.predict_proba(X);
df = pandas.DataFrame(hidden_probs);
df.columns = ['0', '1', '2', '3'];
# my sentiment definition
bubble_state = hidden_states[numpy.where(subsubdata['DATE']=='2011-09-19 15:47:03+0200')[0][0]];
sideways_state = hidden_states[numpy.where(subsubdata['DATE']=='2016-10-03 00:00:18+0200')[0][0]];
bullish_state = hidden_states[numpy.where(subsubdata['DATE']=='2013-02-11 00:57:24+0100')[0][0]];
bearish_state = list(set([0,1,2,3]).difference([bubble_state, sideways_state, bullish_state]))[0];
state_index = {'violet':str(bubble_state), 'green':str(bullish_state), 'black':str(sideways_state), 'red':str(bearish_state)};
state_index_= {str(bubble_state):'violet', str(bullish_state):'green', str(sideways_state):'black', str(bearish_state):'red'};
# plot the result
import matplotlib.pyplot as plt;
fig, ax = plt.subplots(figsize=(2.24,2.24));
df[state_index['violet']].plot(ax=ax, color='violet', linewidth=0.6, label='bubble');
df[state_index['green']].plot(ax=ax, color='green', linewidth=0.6, label='bullish');
df[state_index['black']].plot(ax=ax, color='black', linewidth=0.6, label='sideways');
df[state_index['red']].plot(ax=ax, color='red', linewidth=0.6, label='bearish');
subsubdata['LOG10_PRICE'].plot(ax=ax, color='blue', linewidth=0.6, label='log10(price)');
subsubdata['LOG10_MA50'].plot(ax=ax, color='orange', linewidth=0.6, linestyle='dashed', label='log10(MA50)');
subsubdata['LOG10_MA200'].plot(ax=ax, color='orange', linewidth=0.6, label='log10(MA200)');
for i in range(len(hidden_states)):
rect = matplotlib.patches.Rectangle((i-0.5,0),1,-0.1,linewidth=1,edgecolor='none',facecolor=state_index_[str(hidden_states[i])]);
ax.add_patch(rect);
ax.legend();
ticks = numpy.asarray(list(range(subsubdata.shape[0])));
ticks = ticks[ticks%52==0];
plt.xticks(ticks=ticks, labels=subsubdata['DAY'].iloc[ticks], rotation='horizontal');
plt.title('Four states HMM (last week starting at ' + subsubdata['DAY'].iloc[subsubdata.shape[0]-1] + ')');
plt.show();
Plot:
