import numpy as np
import pandas as pd
import streamlit as st
from pandas_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
from kbcstorage.client import Client
from st_aggrid import AgGrid, GridOptionsBuilder
from st_aggrid.shared import GridUpdateMode
import plotly.express as px
import datetime as dt
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
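# Dependencies (pip package names): streamlit, pandas, numpy, pandas-profiling,
# streamlit-pandas-profiling, kbcstorage, streamlit-aggrid, plotly, prophet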
# Web App Title
st.markdown('''
# **The Keboola Storage EDA App**
This **EDA App** is built with Streamlit and the **pandas-profiling** library.
Select a Keboola Storage bucket and table, and the app will generate a profile report, interactive visualizations, and a Prophet forecast.
---
''')
# Sidebar: Keboola Storage connection details
st.sidebar.markdown('''
# Enter your Keboola Storage connection details
''')
st.session_state['connection_url'] = st.sidebar.selectbox('Connection URL', ['https://connection.keboola.com/', 'https://connection.north-europe.azure.keboola.com/', 'https://connection.eu-central-1.keboola.com/'])
st.session_state['api_token'] = st.sidebar.text_input('API Token', type="password")
# Create a Keboola Storage client when the user clicks Connect
if st.sidebar.button('Connect'):
    if 'client' not in st.session_state:
        client = Client(st.session_state['connection_url'], st.session_state['api_token'])
        try:
            client.buckets.list()  # verify that the URL and token are valid
            st.session_state['client'] = client
            st.sidebar.success('Connected to Keboola Storage')
        except Exception:
            st.sidebar.error('Could not connect to Keboola Storage')
else:
    st.sidebar.button('Disconnect', on_click=lambda: st.session_state.pop('client', None))
# Everything below needs a client, so stop here until one is available
if 'client' not in st.session_state:
    st.info('Enter your Keboola Storage connection details in the sidebar and click Connect.')
    st.stop()
# Sidebar: pick a bucket from Storage
st.sidebar.header('Select a bucket from storage')
def get_buckets():
    st.session_state['buckets'] = st.session_state['client'].buckets.list()
    st.session_state['buckets_df'] = pd.DataFrame(st.session_state['buckets'])
    st.session_state['bucket'] = st.sidebar.selectbox('Bucket', st.session_state['buckets_df']['name'])
    st.session_state['bucket_id'] = st.session_state['buckets_df'][st.session_state['buckets_df']['name'] == st.session_state['bucket']]['id'].values[0]
    return st.session_state['bucket_id']
st.session_state['bucket_id'] = get_buckets()
# Sidebar: pick a table from the selected bucket
st.sidebar.header('Select a table from the bucket')
def get_tables():
    st.session_state['tables'] = st.session_state['client'].buckets.list_tables(bucket_id=st.session_state['bucket_id'])
    tables_df = pd.DataFrame(st.session_state['tables'])
    table = st.sidebar.selectbox('Table', tables_df['name'])
    table_id = tables_df[tables_df['name'] == table]['id'].values[0]
    return table_id
table_id = get_tables()
# export_to_file downloads the table as a CSV and returns the local file path
st.session_state['uploaded_file'] = st.session_state['client'].tables.export_to_file(table_id=table_id, path_name='.')
query_df = pd.read_csv(st.session_state['uploaded_file'])
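# Note: the full table is held in memory as a dataframe; profiling and charting
# very large tables may be slow.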
########################################################################################################################
######################################### INTERACTIVE TABLE ############################################################
st.header("📊 Explore")
def aggrid_interactive_table(df: pd.DataFrame):
    """Create an interactive st-aggrid table from a dataframe.

    Args:
        df (pd.DataFrame): Source dataframe

    Returns:
        dict: The AgGrid response, including any selected rows
    """
options = GridOptionsBuilder.from_dataframe(
df, enableRowGroup=True, enableValue=True, enablePivot=True
)
options.configure_side_bar()
options.configure_selection("single")
selection = AgGrid(
df,
enable_enterprise_modules=True,
gridOptions=options.build(),
theme="dark",
height=800,
update_mode=GridUpdateMode.MODEL_CHANGED,
allow_unsafe_jscode=True,
)
    return selection  # the full AgGrid response, including any selected rows
selection = aggrid_interactive_table(df=query_df) # create the table
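# Depending on the streamlit-aggrid version, the response exposes the user's
# selection (e.g. selection["selected_rows"]); it is not used further here.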
########################################################################################################################
######################################### MORE OPTIONS SECTION ########################################################
st.write('---')
with st.expander("More Options"):
with st.container():
        # export_to_file returned a local file path, so download the CSV bytes rather than the path string
        with open(st.session_state['uploaded_file'], 'rb') as csv_file:
            csv_bytes = csv_file.read()
        st.download_button(
            label="Download data as CSV",
            data=csv_bytes,
            file_name="table_export.csv",
            mime='text/csv',
        )
    if st.button('Generate Pandas Profiling Report'):
pr = ProfileReport(query_df, explorative=True)
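        # Tip: for very wide tables, ProfileReport(query_df, explorative=True, minimal=True)
        # skips most of the expensive correlation/interaction computations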
st.header('**Input DataFrame**')
st.write(query_df)
st.write('---')
st.header('**Pandas Profiling Report**')
st_profile_report(pr)
st.write('---')
########################################################################################################################
######################################### VISUALIZATION SECTION ########################################################
with st.expander("Visualization Builder"):
    x = st.selectbox("Select a dimension", query_df.columns, index=0)
    z = st.selectbox("Select a breakdown dimension", query_df.columns, index=1)
    y = st.selectbox("Select a metric", query_df.columns, index=2)
    viz_type = st.selectbox("Select a visualization type", ["line", "bar", "pie", "scatter"])
    # build the chosen chart when the button is clicked
if st.button("Generate Visualization"):
        if viz_type == 'line':
            fig = px.line(query_df, x=x, y=y, color=z)
        elif viz_type == 'bar':
            fig = px.bar(query_df, x=x, y=y, color=z)
        elif viz_type == 'pie':
            fig = px.pie(query_df, values=y, names=z)
        else:
            fig = px.scatter(query_df, x=x, y=y, color=z)
        st.plotly_chart(fig)  # render the selected chart
st.write('---')
########################################################################################################################
######################################### FORECASTING SECTION ########################################################
with st.expander("Forecasting Area"):
st.header('**Forecasting**')
with st.container():
ds_selection = st.selectbox("Select a date column", query_df.columns)
        # lay out the forecasting inputs in three columns
        container1_col1, container1_col2, container1_col3 = st.columns(3)
        # Column 1: data frequency
        with container1_col1:
            freq = st.selectbox(
                "What is the data frequency?", ("Monthly", "Weekly", "Daily"))
        # Column 2: number of periods to forecast
        with container1_col2:
            period = st.number_input(
                "Number of periods to forecast", value=1, min_value=1, max_value=365)
        # Column 3: seasonality mode
        with container1_col3:
            seasonality = st.selectbox(
                "Seasonality Mode", ("additive", "multiplicative"))
with st.container():
container2_col1, container2_col2 = st.columns(2)
with container2_col1:
changepoint_prior_scale = st.slider(
"changepoint_prior_scale", 0.01, 0.99, 0.05)
with container2_col2:
changepoint_range = st.slider(
"changepoint_range", 0.01, 0.99, 0.05)
if freq == "Monthly":
freq = "MS"
elif freq == "Weekly":
freq = "W"
else:
freq = "D"
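    # 'MS' (month start), 'W', and 'D' are pandas offset aliases that Prophet's
    # make_future_dataframe understands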
y = st.selectbox(
"Select a target column (it must be numeric)", query_df.columns, index=1)
if st.button("Forecast"):
with st.spinner('Forecasting...'):
ds = query_df[ds_selection]
# group ds by index and sum
#ds = ds.groupby(ds.index).sum()
y = query_df[y]
            # coerce the target to a numeric dtype; non-numeric values become NaN
            y = pd.to_numeric(y, errors='coerce')
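            # Prophet expects a two-column dataframe: 'ds' (datestamp) and
            # 'y' (the numeric value to forecast)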
df = pd.DataFrame({'ds': ds, 'y': y})
df = df.groupby('ds').agg('sum')
df = df.reset_index()
st.write(df)
            # keep only rows up to today (assumes 'ds' holds ISO-formatted dates)
            today = dt.datetime.today().strftime('%Y-%m-%d')
            df = df.query('ds <= @today')
m = Prophet(seasonality_mode=seasonality,
changepoint_prior_scale=changepoint_prior_scale, changepoint_range=changepoint_range)
m.fit(df)
future = m.make_future_dataframe(periods=period, freq=freq)
forecast = m.predict(future)
st.write(forecast)
fig1 = m.plot(forecast)
fig2 = m.plot_components(forecast)
st.write('---')
st.header('**Forecast**')
st.pyplot(fig1)
st.write('---')
st.header('**Forecast Components**')
            st.pyplot(fig2)
forecasting_params = {
'y': y,
'ds': ds,
'df': df,
'm': m,
'freq': freq,
'period': period,
'seasonality': seasonality,
'changepoint_prior_scale': changepoint_prior_scale,
'changepoint_range': changepoint_range
}
def forecasting_parameters(forecasting_params):
for key, value in forecasting_params.items():
st.session_state[key] = value
forecasting_parameters(forecasting_params)
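            # Streamlit reruns the script top-to-bottom on every interaction, so the
            # fitted inputs are stashed in session_state for the cross-validation step below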
########################################################################################################################
######################################### CROSS VALIDATION SECTION ########################################################
st.subheader("Cross-validation Parameters")
with st.container():
cv_col1, cv_col2, cv_col3 = st.columns(3)
    with cv_col1:
        initial = st.number_input("Initial training window (periods)",
                                  value=4, min_value=2, max_value=200)
    with cv_col2:
        period = st.number_input("Spacing between cutoffs (periods)",
                                 value=4, min_value=2, max_value=200)
    with cv_col3:
        horizon = st.number_input("Forecast horizon (periods)",
                                  value=4, min_value=2, max_value=200)
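# In Prophet's cross_validation, 'initial' is the length of the first training
# window, 'period' is the spacing between cutoff dates, and 'horizon' is how far
# past each cutoff the forecast is evaluated.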
if st.button("Run Cross Validation"):
    # (debug) uncomment to inspect the parameters stored in session_state:
    # st.write(st.session_state)
with st.spinner('Running Cross Validation...'):
        def mean_absolute_percentage_error(y_true, y_pred):
            """Return the mean absolute percentage error (in %) between two arrays.

            Note: the result is undefined when y_true contains zeros.
            """
            y_true, y_pred = np.array(y_true), np.array(y_pred)
            return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        m = Prophet(seasonality_mode=st.session_state.seasonality, changepoint_prior_scale=st.session_state.changepoint_prior_scale, changepoint_range=st.session_state.changepoint_range)
        m.fit(st.session_state.df)
        # cross_validation measures its windows with pandas Timedeltas, which do
        # not accept the 'MS' alias, so express every window in days
        unit_days = {'MS': 30, 'W': 7, 'D': 1}[freq]  # a month is approximated as 30 days
        cv_results = cross_validation(
            m, initial=f'{initial * unit_days} days', period=f'{period * unit_days} days', horizon=f'{horizon * unit_days} days')
        mape_baseline = mean_absolute_percentage_error(cv_results.y, cv_results.yhat)
        st.metric('MAPE (%)', round(mape_baseline, 2))
df_p = performance_metrics(cv_results)
st.dataframe(df_p)
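        # performance_metrics aggregates the cross-validation results into rolling
        # error metrics (e.g. mse, rmse, mae, mape, coverage) by horizon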
mape_fig = plot_cross_validation_metric(cv_results, metric='mape')
mse_fig = plot_cross_validation_metric(cv_results, metric='mse')
st.pyplot(mape_fig)
st.pyplot(mse_fig)
st.write(cv_results)
st.write('---')