EDA on Numerical Data#
%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf
# Only warnings and above are emitted by the root logger, which keeps the
# notebook output free of informational log lines.
logging.basicConfig(level=logging.WARNING)

# Report the ekorpkit version and the runtime this notebook is executing in.
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())

# Dump the ekorpkit environment settings (label spelling corrected).
print("environment variables:")
eKonf.print(eKonf.env().dict())
version: 0.1.33+20.g8433774.dirty
is notebook? True
is colab? False
evironment varialbles:
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
'EKORPKIT_DATA_DIR': None,
'EKORPKIT_PROJECT': 'ekorpkit-book',
'EKORPKIT_WORKSPACE_ROOT': '/workspace',
'NUM_WORKERS': 230}
# Relative path of the directory used below both for loading the input
# parquet file and for saving the generated training sets.
data_dir = "../data/fomc"
Load preprocessed data#
# Read the preprocessed dataset from the parquet file located under data_dir.
econ_data = eKonf.load_data("econ_data2.parquet", data_dir)
# Display the last rows as a quick check of what was loaded.
econ_data.tail()
unscheduled | forecast | confcall | speaker | rate | rate_change | rate_decision | rate_changed | GDP | GDP_diff_prev | ... | Rate | Taylor | Balanced | Inertia | Taylor-Rate | Balanced-Rate | Inertia-Rate | Taylor_diff | Balanced_diff | Inertia_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2021-11-03 | False | False | False | Jerome Powell | 0.25 | 0.00 | 0.0 | 0 | 19478.893 | 0.570948 | ... | 0.25 | 5.747177 | 4.940210 | -0.528532 | 5.497177 | 4.690210 | -0.778532 | 0.0 | 0.0 | 0.0 |
2021-12-15 | False | True | False | Jerome Powell | 0.25 | 0.00 | 0.0 | 0 | 19478.893 | 0.570948 | ... | 0.25 | 6.472329 | 5.665362 | -0.637304 | 6.222329 | 5.415362 | -0.887304 | 0.0 | 0.0 | 0.0 |
2022-01-26 | False | False | False | Jerome Powell | 0.25 | 0.00 | 0.0 | 0 | 19478.893 | 0.570948 | ... | 0.25 | 7.222928 | 6.415961 | -0.749894 | 6.972928 | 6.165961 | -0.999894 | 0.0 | 0.0 | 0.0 |
2022-03-16 | False | True | False | Jerome Powell | 0.50 | 0.25 | 1.0 | 1 | 19806.290 | 1.680778 | ... | 0.25 | 8.499377 | 8.267766 | -1.027665 | 8.249377 | 8.017766 | -1.277665 | 0.0 | 0.0 | 0.0 |
2022-05-04 | False | False | False | Jerome Powell | 1.00 | 0.50 | 1.0 | 1 | 19735.895 | -0.355417 | ... | 0.50 | 8.094924 | 7.420939 | -0.688141 | 7.594924 | 6.920939 | -1.188141 | 0.0 | 0.0 | 0.0 |
5 rows × 58 columns
EDA on numerical data#
# Encode the rate decision twice -- as a readable label and as a signed code
# (-1 cut, 0 hold, 1 hike) -- then attach the neighbouring rows' codes so the
# inertia effect of the previous decision can be examined.
raw_decision = econ_data["rate_decision"]

# Anything that is neither >= 1 nor <= -1 (including NaN) falls through to
# "Hold" / 0.
econ_data["Rate Decision"] = raw_decision.map(
    lambda value: "Hike" if value >= 1 else ("Cut" if value <= -1 else "Hold")
)
econ_data["rate_decision"] = raw_decision.map(
    lambda value: 1 if value >= 1 else (-1 if value <= -1 else 0)
)

# shift(1) carries the previous row's code onto each row, shift(-1) the next
# row's; the first / last row therefore ends up with NaN.
decision_code = econ_data["rate_decision"]
econ_data["prev_decision"] = decision_code.shift(1)
econ_data["next_decision"] = decision_code.shift(-1)

preview_columns = ["Rate Decision", "rate_decision", "prev_decision", "next_decision"]
econ_data[preview_columns].head()
Rate Decision | rate_decision | prev_decision | next_decision | |
---|---|---|---|---|
date | ||||
1982-10-05 | Cut | -1 | NaN | -1.0 |
1982-11-16 | Cut | -1 | -1.0 | 0.0 |
1982-12-21 | Hold | 0 | -1.0 | 0.0 |
1983-01-14 | Hold | 0 | 0.0 | 0.0 |
1983-01-21 | Hold | 0 | 0.0 | 0.0 |
Compare distributions by rate decisions#
def plot_distribution(data, columns):
    """Plot how each column is distributed across the rate decisions.

    For every name in ``columns`` two ekorpkit plot configs are composed and
    instantiated against ``data``, in this order:

    1. a KDE plot of the column using "Rate Decision" as hue;
    2. a facet grid with one panel per "Rate Decision" value, each mapped to
       a 50-bin ``histplot`` with ``kde=True``.

    Args:
        data: Data handed to ``eKonf.instantiate``; it is expected to hold
            every name in ``columns`` as well as a "Rate Decision" column.
        columns: Iterable of column names to plot, processed in order.
    """
    for column in columns:
        # --- KDE plot, one curve per rate decision -------------------------
        kde_cfg = eKonf.compose("visualize/plot=kdeplot")
        kde_cfg.figure.figsize = (8, 4)

        kde_cfg.kdeplot.x = column
        kde_cfg.kdeplot.hue = "Rate Decision"
        kde_cfg.kdeplot.palette = "tab10"

        kde_cfg.ax.legend = ["Hike", "Hold", "Cut"]
        kde_cfg.ax.grid = False
        # Title and x label are read back from the config value set above.
        kde_cfg.ax.title = f"Distribution of {kde_cfg.kdeplot.x}"
        kde_cfg.ax.ylabel = "Frequency"
        kde_cfg.ax.xlabel = kde_cfg.kdeplot.x

        eKonf.instantiate(kde_cfg, data=data)

        # --- Histogram facets, one panel per rate decision -----------------
        facet_cfg = eKonf.compose(config_group="visualize/plot=facetgrid")
        facet_cfg.figure.figsize = (8, 4)
        facet_cfg.theme.palette = "pastel"

        facet_cfg.facetgrid.col = "Rate Decision"
        facet_cfg.facetgrid.height = 3
        facet_cfg.facetgrid.map_dataframe._func_ = "histplot"
        facet_cfg.facetgrid.map_dataframe.rcParams = {
            "x": column,
            "bins": 50,
            "kde": True,
        }

        eKonf.instantiate(facet_cfg, data=data)
# Column groups to inspect; plot_distribution is called once per group, in
# the order listed.
indicator_groups = [
    ["GDP_diff_prev", "GDP_diff_year", "GDPPOT_diff_prev", "GDPPOT_diff_year"],
    ["PCE_diff_prev", "PCE_diff_year", "CPI_diff_prev", "CPI_diff_year"],
    [
        "UNEMP",
        "UNEMP_diff_prev",
        "UNEMP_diff_year",
        "EMP",
        "EMP_diff_prev",
        "EMP_diff_year",
    ],
    ["RSALES_diff_prev", "RSALES_diff_year", "HSALES_diff_prev", "HSALES_diff_year"],
]
for indicator_group in indicator_groups:
    plot_distribution(econ_data, indicator_group)
Create Training Data Set#
# Small dataset: the decision label, the previous decision and eight
# indicator columns.
columns = [
    # label and previous decision
    "Rate Decision",
    "prev_decision",
    # indicators
    "GDP_diff_prev",
    "PMI",
    "EMP_diff_prev",
    "RSALES_diff_year",
    "UNEMP_diff_prev",
    "HSALES_diff_year",
    # rule-based columns
    "Inertia_diff",
    "Balanced_diff",
]

# Take an independent copy of just these columns and expose the label under
# the name "target".
econ_train_small = (
    econ_data[columns].copy().rename(columns={"Rate Decision": "target"})
)
print(econ_train_small.shape)
econ_train_small.tail()
(415, 10)
target | prev_decision | GDP_diff_prev | PMI | EMP_diff_prev | RSALES_diff_year | UNEMP_diff_prev | HSALES_diff_year | Inertia_diff | Balanced_diff | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2021-11-03 | Hold | 0.0 | 0.570948 | 60.5 | 0.288624 | 8.474656 | -9.615385 | -26.135217 | 0.0 | 0.0 |
2021-12-15 | Hold | 0.0 | 0.570948 | 60.6 | 0.437147 | 10.977142 | -8.695652 | -11.163337 | 0.0 | 0.0 |
2022-01-26 | Hold | 0.0 | 0.570948 | 58.8 | 0.395555 | 9.101289 | -7.142857 | -3.673938 | 0.0 | 0.0 |
2022-03-16 | Hike | 0.0 | 1.680778 | 58.6 | 0.476814 | 9.076698 | -5.000000 | 3.125000 | 0.0 | 0.0 |
2022-05-04 | Hike | 1.0 | -0.355417 | 57.1 | 0.283658 | -0.034915 | 0.000000 | -26.946848 | 0.0 | 0.0 |
# Large dataset: the decision label, the previous decision and the full set
# of indicator columns.
columns = [
    # label and previous decision
    "Rate Decision",
    "prev_decision",
    # GDP
    "GDP_diff_prev",
    "GDP_diff_year",
    "GDPPOT_diff_prev",
    "GDPPOT_diff_year",
    # PCE / CPI
    "PCE_diff_prev",
    "PCE_diff_year",
    "CPI_diff_prev",
    "CPI_diff_year",
    # UNEMP / EMP
    "UNEMP",
    "UNEMP_diff_prev",
    "UNEMP_diff_year",
    "EMP",
    "EMP_diff_prev",
    "EMP_diff_year",
    # PMI
    "PMI",
    "PMI_diff_prev",
    "PMI_diff_year",
    # RSALES / HSALES
    "RSALES_diff_prev",
    "RSALES_diff_year",
    "HSALES_diff_prev",
    "HSALES_diff_year",
    # Taylor / Balanced / Inertia columns
    "Taylor-Rate",
    "Balanced-Rate",
    "Inertia-Rate",
    "Taylor_diff",
    "Balanced_diff",
    "Inertia_diff",
]

# Take an independent copy of just these columns and expose the label under
# the name "target".
econ_train_large = (
    econ_data[columns].copy().rename(columns={"Rate Decision": "target"})
)
print(econ_train_large.shape)
(415, 29)
Missing Values#
# prev_decision comes from shift(1), so the first row has no value. The most
# likely decision is 0 (hold), so use that for the missing entry.
#
# The filled column is assigned back to the frame. Do not call
# frame[col].fillna(..., inplace=True) here: that is chained assignment
# through an inplace method, which raises a FutureWarning on recent pandas
# and, with Copy-on-Write enabled, only changes a temporary Series and leaves
# the NaN in the frame.
econ_train_small["prev_decision"] = econ_train_small["prev_decision"].fillna(0)
econ_train_large["prev_decision"] = econ_train_large["prev_decision"].fillna(0)
# Line plot of the two year-over-year sales columns of the small training set
# against "date".
# ax.set_xlim(0, 400)
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.figure.figsize = (15, 8)
# First series: HSALES_diff_year.
cfg.lineplot.x = "date"
cfg.lineplot.y = "HSALES_diff_year"
# Second series: copy the first line's settings and only swap the y column.
# The copy is taken after x / y were set above, so it starts from those values.
lineplot = cfg.lineplot.copy()
lineplot.x = "date"
lineplot.y = "RSALES_diff_year"
# NOTE(review): presumably entries in cfg.plots are drawn as additional plots
# on the same figure -- confirm against the ekorpkit plot config.
cfg.plots.append(lineplot)
eKonf.instantiate(cfg, data=econ_train_small)
# Retail sales growth ratio is difficult to estimate. Though it is not ideal,
# missing values are simply replaced with the column average; the same mean
# imputation is applied to the Taylor / Balanced / Inertia "_diff" columns.
#
# Each filled column is assigned back to its frame. Do not call
# frame[col].fillna(..., inplace=True) here: that is chained assignment
# through an inplace method, which raises a FutureWarning on recent pandas
# and, with Copy-on-Write enabled, only changes a temporary Series, so the
# NaNs would still be present in the frames that are saved afterwards.
mean_imputed_columns = (
    (
        econ_train_small,
        ("RSALES_diff_year", "Inertia_diff", "Balanced_diff"),
    ),
    (
        econ_train_large,
        (
            "RSALES_diff_prev",
            "RSALES_diff_year",
            "Inertia_diff",
            "Balanced_diff",
            "Taylor_diff",
        ),
    ),
)
for frame, column_names in mean_imputed_columns:
    for column_name in column_names:
        # The mean is computed from the column's own non-missing values.
        frame[column_name] = frame[column_name].fillna(frame[column_name].mean())
Save Data#
# Write both training sets as parquet files into data_dir, small set first.
training_sets = (
    (econ_train_small, "econ_train_small.parquet"),
    (econ_train_large, "econ_train_large.parquet"),
)
for training_frame, file_name in training_sets:
    eKonf.save_data(training_frame, file_name, data_dir)