Visualizing Features

Visualizing Features#

%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf

logging.basicConfig(level=logging.INFO)
print(eKonf.__version__)
0.1.32+22.g1490fd9.dirty
data_dir = "../data/fomc"

Build and load a feature set#

cfg = eKonf.compose("dataset=feature_build")
cfg.name = "fomc_features_small"
cfg.data_dir = data_dir
cfg.data_file = "econ_train_small.parquet"
cfg.force.build = True
cfg.pipeline.split_sampling.stratify_on = "target"
cfg.pipeline.split_sampling.random_state = 123
cfg.pipeline.split_sampling.test_size = 0.25
cfg.pipeline.reset_index.index_column_name = "date"
cfg.column_info.columns.index = "index"
cfg.column_info.columns.id = "date"
cfg.column_info.columns.x = [
    "prev_decision",
    "GDP_diff_prev",
    "PMI",
    "EMP_diff_prev",
    "RSALES_diff_year",
    "UNEMP_diff_prev",
    "HSALES_diff_year",
    "Inertia_diff",
    "Balanced_diff",
]
cfg.column_info.columns.y = "target"
f_small = eKonf.instantiate(cfg)
f_small.persist()
INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('load_dataframe', 'load_dataframe'), ('reset_index', 'reset_index'), ('split_sampling', 'split_sampling')])
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function load_dataframe at 0x7fd4706e8310>)
INFO:ekorpkit.io.file:Processing [1] files from ['econ_train_small.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/fomc/econ_train_small.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/fomc/econ_train_small.parquet
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function reset_index at 0x7fd4706e7670>)
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function split_sampling at 0x7fd4706e7280>)
INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_features_small/fomc_features_small-train.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_features_small/fomc_features_small-test.parquet
INFO:ekorpkit.datasets.feature:Dataset fomc_features_small built with 415 rows
INFO:ekorpkit.io.file:Processing [1] files from ['fomc_features_small-train.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/fomc/fomc_features_small/fomc_features_small-train.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/fomc/fomc_features_small/fomc_features_small-train.parquet
INFO:ekorpkit.info.column:index: index, index of data: None, columns: ['date', 'target', 'prev_decision', 'GDP_diff_prev', 'PMI', 'EMP_diff_prev', 'RSALES_diff_year', 'UNEMP_diff_prev', 'HSALES_diff_year', 'Inertia_diff', 'Balanced_diff'], id: ['date']
INFO:ekorpkit.info.column:Adding id [split] to ['date']
INFO:ekorpkit.info.column:Added id [split], now ['date', 'split']
INFO:ekorpkit.info.column:Added a column [split] with value [train]
WARNING:ekorpkit.datasets.base:File fomc_features_small-dev.parquet not found.
INFO:ekorpkit.io.file:Processing [1] files from ['fomc_features_small-test.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/fomc/fomc_features_small/fomc_features_small-test.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/fomc/fomc_features_small/fomc_features_small-test.parquet
INFO:ekorpkit.info.column:Added a column [split] with value [test]
INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:00.004594
INFO:ekorpkit.info.stat: >> elapsed time to calculate statistics: 0:00:00.003903
INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_features_small/fomc_features_small-train.parquet
INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_features_small/fomc_features_small-test.parquet
f_small.train_data
date target prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff split
index
0 2002-09-24 Hold 0.0 0.615502 50.3 -0.002297 2.206995 -1.724138 17.090069 0.0 0.0 train
1 1997-03-25 Hike 0.0 1.038398 53.1 0.257080 3.197859 -1.886792 4.161248 0.0 0.0 train
2 2011-04-27 Hold 0.0 0.519595 59.1 0.179317 4.107119 0.000000 -21.259843 0.0 0.0 train
3 1989-02-08 Hike 1.0 1.332501 54.7 0.246029 2.094256 1.886792 22.905983 0.0 0.0 train
4 2008-09-16 Hold 0.0 0.572295 49.2 -0.202195 -4.803606 5.172414 -37.768240 0.0 0.0 train
... ... ... ... ... ... ... ... ... ... ... ... ...
306 1998-08-18 Hold 0.0 0.925821 49.2 0.102319 2.026188 0.000000 8.819876 0.0 0.0 train
307 2021-04-28 Hold 0.0 1.115277 63.7 0.489818 26.945039 -3.225806 44.426230 0.0 0.0 train
308 2004-01-28 Hold 0.0 1.666896 60.1 0.087374 2.493088 -1.724138 7.729008 0.0 0.0 train
309 2015-06-17 Hold 0.0 0.813314 52.8 0.234429 2.662704 3.703704 11.308204 0.0 0.0 train
310 2007-08-16 Hold 0.0 0.638435 52.4 -0.022453 0.622821 2.173913 -19.378238 0.0 0.0 train

311 rows × 12 columns

cfg = eKonf.compose(config_group="dataset=feature")
cfg.name = "fomc_features_small"
cfg.data_dir = data_dir
f_small = eKonf.instantiate(cfg)
f_small.INFO
{'splits': {'train': {'data_file': 'fomc_features_small-train.parquet',
   'name': 'train',
   'num_examples': 311,
   'num_bytes': 63207,
   'human_bytes': '61.73 KiB'},
  'test': {'data_file': 'fomc_features_small-test.parquet',
   'name': 'test',
   'num_examples': 104,
   'num_bytes': 21118,
   'human_bytes': '20.62 KiB'}},
 'num_examples': 415,
 'size_in_bytes': 84325,
 'size_in_human_bytes': '82.35 KiB',
 'data_files': {'train': 'fomc_features_small-train.parquet',
  'test': 'fomc_features_small-test.parquet'},
 'meta_files': {},
 'data_files_modified': '2022-06-14 10:23:24',
 'info_updated': '2022-06-14 10:23:24',
 'column_info': {'_keys_': {'index': 'index',
   'id': 'id',
   'x': 'x',
   'y': 'y',
   'split': 'split',
   'dataset': 'dataset'},
  'columns': {'index': 'date',
   'id': ['date', 'split'],
   'x': ['prev_decision',
    'GDP_diff_prev',
    'PMI',
    'EMP_diff_prev',
    'RSALES_diff_year',
    'UNEMP_diff_prev',
    'HSALES_diff_year',
    'Inertia_diff',
    'Balanced_diff'],
   'y': 'target'},
  'datetime': {'columns': None,
   'format': None,
   'rcParams': None,
   '_parms_': None},
  'data': {'date': 'datetime64[ns]',
   'target': 'object',
   'prev_decision': 'float64',
   'GDP_diff_prev': 'float64',
   'PMI': 'float64',
   'EMP_diff_prev': 'float64',
   'RSALES_diff_year': 'float64',
   'UNEMP_diff_prev': 'float64',
   'HSALES_diff_year': 'float64',
   'Inertia_diff': 'float64',
   'Balanced_diff': 'float64',
   'split': 'str'},
  'keys': {'index': 'date', 'id': 'id', 'x': 'x', 'y': 'y', 'split': 'split'}}}
y_train = f_small.y_train
print(f_small.COLUMN.Y)
y_train[:5]
target
date
0    Hold
1    Hold
2    Hold
3     Cut
4     Cut
Name: target, dtype: object
X_train = f_small.X_train
print(f_small.COLUMN.X)
X_train[:5]
['prev_decision', 'GDP_diff_prev', 'PMI', 'EMP_diff_prev', 'RSALES_diff_year', 'UNEMP_diff_prev', 'HSALES_diff_year', 'Inertia_diff', 'Balanced_diff']
prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff
date
0 0.0 7.547535 57.7 0.234047 3.055276 -2.898551 22.622478 0.0 0.0
1 0.0 0.843712 51.9 0.058919 0.196754 -2.173913 -21.960415 0.0 0.0
2 1.0 0.364447 54.7 0.221891 4.213922 -5.000000 -8.387800 0.0 0.0
3 -1.0 0.743327 51.2 0.124735 2.094256 -4.285714 13.643411 0.0 0.0
4 0.0 1.284129 56.1 0.221892 3.239959 -3.278689 16.625310 0.0 0.0
cfg = eKonf.compose("dataset=feature_build")
cfg.name = "fomc_features_large"
cfg.data_dir = data_dir
cfg.data_file = "econ_train_large.parquet"
cfg.force.rebuild = True
cfg.pipeline.split_sampling.stratify_on = "target"
cfg.pipeline.split_sampling.random_state = 123
cfg.pipeline.split_sampling.test_size = 0.25
cfg.pipeline.reset_index.index_column_name = "date"
cfg.column_info.columns.index = "date"
cfg.column_info.columns.id = "index"
cfg.column_info.columns.x = [
    "prev_decision",
    "GDP_diff_prev",
    "GDP_diff_year",
    "GDPPOT_diff_prev",
    "GDPPOT_diff_year",
    "PCE_diff_prev",
    "PCE_diff_year",
    "CPI_diff_prev",
    "CPI_diff_year",
    "UNEMP",
    "UNEMP_diff_prev",
    "UNEMP_diff_year",
    "EMP",
    "EMP_diff_prev",
    "EMP_diff_year",
    "PMI",
    "PMI_diff_prev",
    "PMI_diff_year",
    "RSALES_diff_prev",
    "RSALES_diff_year",
    "HSALES_diff_prev",
    "HSALES_diff_year",
    "Taylor-Rate",
    "Balanced-Rate",
    "Inertia-Rate",
    "Taylor_diff",
    "Balanced_diff",
    "Inertia_diff",
]
cfg.column_info.columns.y = "target"
cfg.verbose = False
f_large = eKonf.instantiate(cfg)
f_large.persist()

Visualize Features#

cfg = eKonf.compose(config_group="dataset=feature")
cfg.name = "fomc_features_small"
cfg.data_dir = data_dir
f_small = eKonf.instantiate(cfg)
f_small.train_data.head()
date target prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff split
date
0 2002-09-24 Hold 0.0 0.615502 50.3 -0.002297 2.206995 -1.724138 17.090069 0.0 0.0 train
1 1997-03-25 Hike 0.0 1.038398 53.1 0.257080 3.197859 -1.886792 4.161248 0.0 0.0 train
2 2011-04-27 Hold 0.0 0.519595 59.1 0.179317 4.107119 0.000000 -21.259843 0.0 0.0 train
3 1989-02-08 Hike 1.0 1.332501 54.7 0.246029 2.094256 1.886792 22.905983 0.0 0.0 train
4 2008-09-16 Hold 0.0 0.572295 49.2 -0.202195 -4.803606 5.172414 -37.768240 0.0 0.0 train
f_small.y_train = f_small.transform_labels(f_small.y_train)
f_small.y_dev = f_small.transform_labels(f_small.y_dev)
f_small.y_test = f_small.transform_labels(f_small.y_test)
X_cols = f_small.COLUMN.X
y_col = f_small.COLUMN.Y
print(f_small.classes, X_cols, y_col)
['Cut', 'Hike', 'Hold'] ['prev_decision', 'GDP_diff_prev', 'PMI', 'EMP_diff_prev', 'RSALES_diff_year', 'UNEMP_diff_prev', 'HSALES_diff_year', 'Inertia_diff', 'Balanced_diff'] target
f_small.train_data.head()
date target prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff split
date
0 2002-09-24 2 0.0 0.615502 50.3 -0.002297 2.206995 -1.724138 17.090069 0.0 0.0 train
1 1997-03-25 1 0.0 1.038398 53.1 0.257080 3.197859 -1.886792 4.161248 0.0 0.0 train
2 2011-04-27 2 0.0 0.519595 59.1 0.179317 4.107119 0.000000 -21.259843 0.0 0.0 train
3 1989-02-08 1 1.0 1.332501 54.7 0.246029 2.094256 1.886792 22.905983 0.0 0.0 train
4 2008-09-16 2 0.0 0.572295 49.2 -0.202195 -4.803606 5.172414 -37.768240 0.0 0.0 train
# cfg = eKonf.compose(config_group="visualize/plot=radviz")
# cfg.plots[0].x = X_cols
# cfg.plots[0].y = y_col
# cfg.plots[0].classes = f_small.classes
# cfg.figure.figsize = (10, 6)
# cfg.figure.fontsize = 14
# eKonf.instantiate(cfg, data=f_small.data)
cfg = eKonf.compose(config_group="visualize/plot=jointplot")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].columns = ["PMI", "GDP_diff_prev"]
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
WARNING:matplotlib.font_manager:findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
WARNING:matplotlib.font_manager:findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
../../../_images/debe9a721383540d21e38abbe0ef3a77bc427158b7573e15ad146c44eb0f42b8.png
cfg = eKonf.compose(config_group="visualize/plot=rank1d")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].classes = f_small.classes
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
../../../_images/3d9ecd3dcad7ebeeb441b88bd779c80df0e0bf1f39eed86c37d453bd122f8051.png
cfg = eKonf.compose(config_group="visualize/plot=rank2d")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
../../../_images/d873b7f32a8bdbd479e8b233f92a0ff37b800f1181d7e3a926febc8081c30b2e.png
cfg = eKonf.compose("visualize/plot=pcoords")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].classes = f_small.classes
cfg.plots[0].sample = 0.8
cfg.plots[0].shuffle = False
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 10
eKonf.instantiate(cfg, data=f_small.data)
../../../_images/5708725f30e38a1307c756bbdf6ecad39627977487f89afd2f7bf01b4740de96.png
cfg = eKonf.compose(config_group="visualize/plot=pca")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].classes = f_small.classes
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
../../../_images/018579615cac78923113dce292e386324385ec8268ed8c6f45c79b468af9afd7.png
cfg = eKonf.compose(config_group="visualize/plot=manifold")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].classes = f_small.classes
cfg.plots[0].manifold="isomap"
cfg.plots[0].n_neighbors=4
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
../../../_images/889299642535480566b19a819b1231734cc2bd6cdabad05c428280f6ee06d0a5.png
cfg = eKonf.compose(config_group="visualize/plot=manifold")
cfg.plots[0].x = X_cols
cfg.plots[0].y = y_col
cfg.plots[0].classes = f_small.classes
cfg.plots[0].manifold = "tsne"
cfg.figure.figsize = (10, 6)
cfg.figure.fontsize = 14
eKonf.instantiate(cfg, data=f_small.data)
/opt/conda/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.
  warnings.warn(
../../../_images/93c771ad58b411a89813fad2416ac289b111303a1cbc53f5ed901320e8ccde23.png