EDA on Numerical Data#

%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf

# Only show WARNING and above from the logging module in the notebook output.
logging.basicConfig(level=logging.WARNING)
# Report the ekorpkit version and what eKonf detects about the runtime.
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())
# Dump the settings returned by eKonf.env() as a dict.
print("evironment varialbles:")
eKonf.print(eKonf.env().dict())
version: 0.1.33+20.g8433774.dirty
is notebook? True
is colab? False
evironment varialbles:
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'NUM_WORKERS': 230}
# Directory for the FOMC data files; passed to eKonf.load_data / eKonf.save_data.
# NOTE(review): relative path — presumably resolved against the notebook's
# working directory; confirm.
data_dir = "../data/fomc"

Load preprocessed data#

# Load the preprocessed table from data_dir and preview its last rows.
econ_data = eKonf.load_data("econ_data2.parquet", data_dir)
econ_data.tail()
unscheduled forecast confcall speaker rate rate_change rate_decision rate_changed GDP GDP_diff_prev ... Rate Taylor Balanced Inertia Taylor-Rate Balanced-Rate Inertia-Rate Taylor_diff Balanced_diff Inertia_diff
date
2021-11-03 False False False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 5.747177 4.940210 -0.528532 5.497177 4.690210 -0.778532 0.0 0.0 0.0
2021-12-15 False True False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 6.472329 5.665362 -0.637304 6.222329 5.415362 -0.887304 0.0 0.0 0.0
2022-01-26 False False False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 7.222928 6.415961 -0.749894 6.972928 6.165961 -0.999894 0.0 0.0 0.0
2022-03-16 False True False Jerome Powell 0.50 0.25 1.0 1 19806.290 1.680778 ... 0.25 8.499377 8.267766 -1.027665 8.249377 8.017766 -1.277665 0.0 0.0 0.0
2022-05-04 False False False Jerome Powell 1.00 0.50 1.0 1 19735.895 -0.355417 ... 0.50 8.094924 7.420939 -0.688141 7.594924 6.920939 -1.188141 0.0 0.0 0.0

5 rows × 58 columns

EDA on numerical data#

# Add previous rate decision to see inertia effect


def _decision_sign(value):
    """Collapse a raw rate decision to -1 (cut), 1 (hike) or 0 (anything else)."""
    if value <= -1:
        return -1
    if value >= 1:
        return 1
    return 0


# Human-readable label for each collapsed decision value.
_DECISION_LABELS = {-1: "Cut", 0: "Hold", 1: "Hike"}

# The label column is created first, from the same thresholds as the numeric
# column, and only then is the raw column overwritten with the collapsed value.
econ_data["Rate Decision"] = (
    econ_data["rate_decision"].map(_decision_sign).map(_DECISION_LABELS)
)
econ_data["rate_decision"] = econ_data["rate_decision"].map(_decision_sign)

# Neighbouring rows' decisions: one row back and one row ahead.
econ_data["prev_decision"] = econ_data["rate_decision"].shift(periods=1)
econ_data["next_decision"] = econ_data["rate_decision"].shift(periods=-1)

econ_data.loc[
    :, ["Rate Decision", "rate_decision", "prev_decision", "next_decision"]
].head()
Rate Decision rate_decision prev_decision next_decision
date
1982-10-05 Cut -1 NaN -1.0
1982-11-16 Cut -1 -1.0 0.0
1982-12-21 Hold 0 -1.0 0.0
1983-01-14 Hold 0 0.0 0.0
1983-01-21 Hold 0 0.0 0.0

Compare distributions by rate decisions#

def plot_distribution(data, columns):
    """Plot each column's distribution split by the "Rate Decision" column.

    For every name in ``columns`` two eKonf plot configs are composed and
    instantiated in turn: a ``kdeplot`` with "Rate Decision" as hue, then a
    ``facetgrid`` with one "Rate Decision" panel per column of the grid,
    each panel mapping ``histplot`` (50 bins, KDE overlay) over the data.

    Args:
        data: Table handed to the plots as ``data=``; it needs the
            "Rate Decision" column and every column listed in ``columns``.
        columns: Iterable of column names to plot, one pair of figures each.
    """
    for col in columns:
        # Figure 1: density curves, one per rate decision.
        kde_cfg = eKonf.compose("visualize/plot=kdeplot")
        kde_cfg.figure.figsize = (8, 4)

        kde_cfg.kdeplot.x = col
        kde_cfg.kdeplot.hue = "Rate Decision"
        kde_cfg.kdeplot.palette = "tab10"
        # NOTE(review): the legend labels are fixed here rather than derived
        # from the data — confirm they line up with the hue order.
        kde_cfg.ax.legend = ["Hike", "Hold", "Cut"]
        kde_cfg.ax.grid = False
        kde_cfg.ax.title = f"Distribution of {col}"
        kde_cfg.ax.ylabel = "Frequency"
        kde_cfg.ax.xlabel = col
        eKonf.instantiate(kde_cfg, data=data)

        # Figure 2: one histogram panel per rate decision.
        facet_cfg = eKonf.compose(config_group="visualize/plot=facetgrid")
        facet_cfg.figure.figsize = (8, 4)
        facet_cfg.theme.palette = "pastel"
        facet_cfg.facetgrid.col = "Rate Decision"
        facet_cfg.facetgrid.height = 3
        facet_cfg.facetgrid.map_dataframe._func_ = "histplot"
        facet_cfg.facetgrid.map_dataframe.rcParams = {
            "x": col,
            "bins": 50,
            "kde": True,
        }
        eKonf.instantiate(facet_cfg, data=data)
# GDP and GDPPOT change columns, both the *_diff_prev and *_diff_year variants.
plot_distribution(
    econ_data,
    [
        f"{indicator}_diff_{basis}"
        for indicator in ("GDP", "GDPPOT")
        for basis in ("prev", "year")
    ],
)
../../../_images/6e557488007513482e39ef0f762e46236d79dbc448914cbeb5d3db2225cd70c5.png ../../../_images/a850f4e13def559fd45d738ed560890d48b12aafb324a2917e1a926a61a344c0.png ../../../_images/56d117cfcc23c91bc57fa485ee611c16f88c737acb7228a7bd39e103e6e5e60a.png ../../../_images/02a89aaa91dcab028cf77a3ee6c59a1d366ca41463e4dded6ab12a60b6f7e82a.png ../../../_images/2a9ea3762cd1456e19ff825f6ceaafce7bccfd768d5cae20d0753bfc89b328f7.png ../../../_images/d7ed4af7924977d88a4420395ae3f42ad25026376a062ad315435d83b8b4edc7.png ../../../_images/0468447e3768363fccbd674deb5f468a64111bb24cca38f868e64f558c9e47e2.png ../../../_images/8ef43d2eb7a6ad259c5cd89fde7f6dddc9ed676b3060ac027e06538c390d85ca.png
# PCE and CPI change columns, both the *_diff_prev and *_diff_year variants.
plot_distribution(
    econ_data,
    [
        f"{indicator}_diff_{basis}"
        for indicator in ("PCE", "CPI")
        for basis in ("prev", "year")
    ],
)
../../../_images/61934b6d91af7e5cd78c0f80578a2c97f10be507d89a82420a817bae9c5bfae2.png ../../../_images/f36b2161030635c538a53fd763029ce45437977315c935a4f247c591753ed9b7.png ../../../_images/7e44522989d7784471fe5c0ac58a96bcd678c2e6072d6bd4e3cc0f272f8cabc5.png ../../../_images/40d611156747214372d7749b1bdca21024ff827943f2d68d6ad71ae95af10601.png ../../../_images/07515dfe4bfd16174866fda4d3dc3a4f252be66b7a586032200f6ef214ae2dc2.png ../../../_images/6bade0e6c3398c1ecbaed40a37a87db075e254dbd0aaef5775b5063f426c8e52.png ../../../_images/3e03f697372dda9b1556d14bf80fd3a3e782de3d034064c365e11276eacd6af6.png ../../../_images/fbc121d1afa6401838eefce8a327ffb05435b054f29022c1e2ad4c5498c41944.png
# UNEMP and EMP: the level column plus its *_diff_prev and *_diff_year variants.
plot_distribution(
    econ_data,
    [
        f"{indicator}{suffix}"
        for indicator in ("UNEMP", "EMP")
        for suffix in ("", "_diff_prev", "_diff_year")
    ],
)
../../../_images/ccbd7678c5196dc785d97ae858c41e55d49ffd50787464ce71c826fea1768f56.png ../../../_images/f92101f6dee0367af5d7321592c12ce826c992037db9bfa11503e65f2cd33402.png ../../../_images/980c9815af2579793819c6d6d3358ed3ad08317b2fb0a93d3a4e37b3dc2885d2.png ../../../_images/0db12bad1456410c1f617ccc3439918c03fad2e1e9cebbad7b82c6bd1437cad7.png ../../../_images/8a6d81af14956a517ec6a0450b7fff8198c23d819d52fac2b525fad7cf507b25.png ../../../_images/333bb59b093994270be3f80382a04ed5cd86f9eaacf5f0658b3ec6261e82e65a.png ../../../_images/ff45679ff2a7b02ec9027398536bd9809f936da27a859f3af578f3892e969e93.png ../../../_images/720034136997d7c3f371540ade5842c43d3c54766334e34fc6d9dac26a65a2bf.png ../../../_images/1a1c91296544c36757127bdd68d134b430c7cf4967fc459e6f37275ca899ab8d.png ../../../_images/3f6c60b6560c74fa3109d87d5fbfe2951ccbecbe561ee74eab1e57520983bcbe.png ../../../_images/484dffef03eda05d1eca054e7ceb3c8ec0775dfd29e141a6f5b829cb1c1da354.png ../../../_images/3cf99fe0fc7d26c0d5cabd5620151975fb851349479b08787878d57bfa338b69.png
# PMI: the level column plus its *_diff_prev and *_diff_year variants.
plot_distribution(
    data=econ_data,
    columns=[f"PMI{suffix}" for suffix in ("", "_diff_prev", "_diff_year")],
)
../../../_images/ca97c4d36f5d408127583689e57fb2b1abaa51dbbc440492c62bdfc678e053cc.png ../../../_images/c128c24cd6d437de8ed5b79fe306b79f5281e95aa07067d4bb40de77906616b7.png ../../../_images/30efe9cbba4ada5bf718a68883b98694f21a698f8fc7e772e2586bef467a0384.png ../../../_images/7ac8930dcab345c97a3465b5f03b7ef0878ec8bbe955a371b070e27c795951d5.png ../../../_images/daea4c4829368f93253ed6bf33f48dc61c1b2531acae747cda568de4997b187f.png ../../../_images/b8235482eb5aabf6df419214a42dc6bb8885d491b9e877a9bf5ff5a032a44f6c.png
# RSALES and HSALES change columns, both the *_diff_prev and *_diff_year variants.
plot_distribution(
    econ_data,
    [
        f"{indicator}_diff_{basis}"
        for indicator in ("RSALES", "HSALES")
        for basis in ("prev", "year")
    ],
)
../../../_images/6690167d643d7697d39875d8601cc9d372f2c872fc9d43bc9353d8a4f5189420.png ../../../_images/7d7dbaf4f93f41a03c902e5adb11114ea9931806622c88d15ccde7fd24a0822b.png ../../../_images/3009b838a9a4da06aa255f09f0fb050968abb2a862e1aa24e5924bf6d0878163.png ../../../_images/d48985fafc103649092edb92d840e7cb3b81ba9d6b7669adf6037dc9f42b2907.png ../../../_images/30fdf33aafbecd34a1f080468804a0c6d937f9e16cee8a7b4cb430708d172f8e.png ../../../_images/198656bb58c8b6f255673ad72f75476cf8f613257dd3c17c7bc7ffa21a0cf867.png ../../../_images/844e4aa34c01537b92b43dd39856f4c963493db0523ebbcc8f3fed0586515eb5.png ../../../_images/0bf1e4e9795ee1b601ac11f1aa863f876466b2ff304f630d9151a5bbafc32697.png
# The three *_diff columns for the Taylor, Balanced and Inertia series.
plot_distribution(
    data=econ_data,
    columns=[f"{rule}_diff" for rule in ("Taylor", "Balanced", "Inertia")],
)
../../../_images/2d16155071fb8618da119c68cf5aac2aeaffa094f5a85ce717fdc749d9e67a64.png ../../../_images/ddce65197b4296ee287f1aab0fde79e99e2b06bd0568ea32bb38a88dabb55a6b.png ../../../_images/a9e9d368976f83bc6e47e4d4d68721cf0184d2d97f87544d5ad90d7b07700e6e.png ../../../_images/d8d9c48213ec268834fd95455ae94586ca375d945ef5f414fcc6b76d0d348bda.png ../../../_images/d4fbf8e3545dcfcf49d8bde67921aa2487be428a5b723f1513475bdaf418d1c3.png ../../../_images/1d202c3e6f01de131c3237b231840be779ba733e8cb559375b6c5bfb5df43ce5.png

Create Training Data Set#

# Small dataset: the label column plus nine selected feature columns.
columns = [
    "Rate Decision", "prev_decision",
    "GDP_diff_prev", "PMI", "EMP_diff_prev",
    "RSALES_diff_year", "UNEMP_diff_prev", "HSALES_diff_year",
    "Inertia_diff", "Balanced_diff",
]

# Selecting by a list of labels and renaming both return new objects, so
# econ_data itself is left untouched.
econ_train_small = econ_data[columns].rename(columns={"Rate Decision": "target"})
print(econ_train_small.shape)
econ_train_small.tail()
(415, 10)
target prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff
date
2021-11-03 Hold 0.0 0.570948 60.5 0.288624 8.474656 -9.615385 -26.135217 0.0 0.0
2021-12-15 Hold 0.0 0.570948 60.6 0.437147 10.977142 -8.695652 -11.163337 0.0 0.0
2022-01-26 Hold 0.0 0.570948 58.8 0.395555 9.101289 -7.142857 -3.673938 0.0 0.0
2022-03-16 Hike 0.0 1.680778 58.6 0.476814 9.076698 -5.000000 3.125000 0.0 0.0
2022-05-04 Hike 1.0 -0.355417 57.1 0.283658 -0.034915 0.000000 -26.946848 0.0 0.0
# Large dataset: the label column plus every candidate feature column,
# grouped one indicator family per line.
columns = [
    "Rate Decision", "prev_decision",
    "GDP_diff_prev", "GDP_diff_year",
    "GDPPOT_diff_prev", "GDPPOT_diff_year",
    "PCE_diff_prev", "PCE_diff_year",
    "CPI_diff_prev", "CPI_diff_year",
    "UNEMP", "UNEMP_diff_prev", "UNEMP_diff_year",
    "EMP", "EMP_diff_prev", "EMP_diff_year",
    "PMI", "PMI_diff_prev", "PMI_diff_year",
    "RSALES_diff_prev", "RSALES_diff_year",
    "HSALES_diff_prev", "HSALES_diff_year",
    "Taylor-Rate", "Balanced-Rate", "Inertia-Rate",
    "Taylor_diff", "Balanced_diff", "Inertia_diff",
]

# Selecting by a list of labels and renaming both return new objects, so
# econ_data itself is left untouched.
econ_train_large = econ_data[columns].rename(columns={"Rate Decision": "target"})
print(econ_train_large.shape)
(415, 29)

Missing Values#

# As most likely the decision is 0 (hold), fill prev_decision of the first row.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that form is chained assignment, and under pandas
# Copy-on-Write it fills a temporary object and leaves the DataFrame unchanged.
econ_train_small["prev_decision"] = econ_train_small["prev_decision"].fillna(0)
econ_train_large["prev_decision"] = econ_train_large["prev_decision"].fillna(0)
# Draw HSALES_diff_year and RSALES_diff_year against "date" from the small
# training set (presumably to inspect gaps before imputing — confirm).
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.figure.figsize = (15, 8)
cfg.lineplot.x = "date"
cfg.lineplot.y = "HSALES_diff_year"

# Copy the first line's settings, point the copy at the second series and
# append it to the config's list of plots.
lineplot = cfg.lineplot.copy()
lineplot.x = "date"
lineplot.y = "RSALES_diff_year"
cfg.plots.append(lineplot)

eKonf.instantiate(cfg, data=econ_train_small)
../../../_images/c163d2564de6c56e08959789b76ea1b9f31687f2735c54365a2ddc080ff6fe8d.png
# Retail sales growth ratio is difficult to estimate. Though it is not ideal,
# simply use the column average; the rule-based *_diff columns are filled the
# same way.
#
# Each column's mean is computed before filling and the result is assigned
# back to the frame. Calling fillna(inplace=True) on a selected column is
# chained assignment: under pandas Copy-on-Write it fills a temporary object
# and the DataFrame keeps its missing values.
for _frame, _mean_filled in (
    (
        econ_train_small,
        ["RSALES_diff_year", "Inertia_diff", "Balanced_diff"],
    ),
    (
        econ_train_large,
        [
            "RSALES_diff_prev",
            "RSALES_diff_year",
            "Inertia_diff",
            "Balanced_diff",
            "Taylor_diff",
        ],
    ),
):
    # fillna with a Series of means fills each column with its own mean.
    _frame[_mean_filled] = _frame[_mean_filled].fillna(
        _frame[_mean_filled].mean()
    )

Save Data#

# Save both training sets into data_dir under their respective file names.
for _filename, _dataset in (
    ("econ_train_small.parquet", econ_train_small),
    ("econ_train_large.parquet", econ_train_large),
):
    eKonf.save_data(_dataset, _filename, data_dir)