EDA on Sentiment Data#

%config InlineBackend.figure_format='retina'
from ekorpkit import eKonf

# Configure ekorpkit logging, then report the library version and the detected runtime.
eKonf.setLogger("WARNING")
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())
# NOTE(review): the label below contains typos ("evironment varialbles"); it is
# left unchanged here because it is runtime output.
print("evironment varialbles:")
# Dump the environment settings returned by eKonf.env() as a dict.
eKonf.print(eKonf.env().dict())
INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client
INFO:ekorpkit.base:Google Colab not detected.
version: 0.1.35+0.g69734d6.dirty
is notebook? True
is colab? False
evironment varialbles:
{'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_LOG_LEVEL': 'WARNING',
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'NUM_WORKERS': 230}
# First year of interest; used further down to filter and clamp the chairperson periods.
start_year = 1999
# Directory the FOMC tone dataset is loaded from.
data_dir = "../data/fomc"
# Last expression of the cell: displays the FRED_API_KEY entry of the environment settings.
eKonf.env().FRED_API_KEY
pydantic.types.SecretStr

Load datasets#

# Load the tone dataset from data_dir and display it as the cell output.
tone_data_lm = eKonf.load_data("fomc_tone_data_lm.parquet", data_dir)
tone_data_lm
polarity_mean_beigebook polarity_mean_meeting_script polarity_mean_minutes polarity_mean_press_conf polarity_mean_speech polarity_mean_statement polarity_mean_testimony polarity_diffusion_beigebook polarity_diffusion_meeting_script polarity_diffusion_minutes ... num_tokens_sum_speech num_tokens_sum_statement num_tokens_sum_testimony num_tokens_mean_beigebook num_tokens_mean_meeting_script num_tokens_mean_minutes num_tokens_mean_press_conf num_tokens_mean_speech num_tokens_mean_statement num_tokens_mean_testimony
date
1990-02-07 NaN -0.087583 NaN NaN NaN NaN NaN NaN -0.095663 NaN ... NaN NaN NaN NaN 30.213010 NaN NaN NaN NaN NaN
1990-03-27 NaN -0.171992 NaN NaN NaN NaN NaN NaN -0.179702 NaN ... NaN NaN NaN NaN 29.846369 NaN NaN NaN NaN NaN
1990-05-15 NaN -0.116052 NaN NaN NaN NaN NaN NaN -0.125461 NaN ... NaN NaN NaN NaN 29.749077 NaN NaN NaN NaN NaN
1990-07-03 NaN -0.114829 NaN NaN NaN NaN NaN NaN -0.117794 NaN ... NaN NaN NaN NaN 29.667920 NaN NaN NaN NaN NaN
1990-08-21 NaN -0.209552 NaN NaN NaN NaN NaN NaN -0.219403 NaN ... NaN NaN NaN NaN 31.032836 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-11-30 NaN NaN NaN NaN -0.167014 NaN -0.12 NaN NaN NaN ... 3066.0 NaN 556.0 NaN NaN NaN NaN 31.937500 NaN 27.8
2021-12-01 -0.046022 NaN NaN NaN NaN NaN NaN -0.048109 NaN NaN ... NaN NaN NaN 22.539497 NaN NaN NaN NaN NaN NaN
2021-12-02 NaN NaN NaN NaN -0.077381 NaN NaN NaN NaN NaN ... 6514.0 NaN NaN NaN NaN NaN NaN 36.188889 NaN NaN
2021-12-15 NaN NaN -0.043929 -0.075441 NaN 0.166667 NaN NaN NaN -0.064286 ... NaN 489.0 NaN NaN NaN 30.521429 37.587413 NaN 27.166667 NaN
2021-12-17 NaN NaN NaN NaN -0.356613 NaN NaN NaN NaN NaN ... 3694.0 NaN NaN NaN NaN NaN NaN 29.317460 NaN NaN

1876 rows × 35 columns

Plot the sentiment scores#

# Line plot of the mean polarity columns for minutes, press conferences,
# speeches and statements.
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.plot.y = [
    "polarity_mean_minutes",
    "polarity_mean_press_conf",
    "polarity_mean_speech",
    "polarity_mean_statement",
]
cfg.ax.title = "The polarity scores of the FOMC corpus"
eKonf.instantiate(cfg, data=tone_data_lm)
../../../_images/183e9c624eeb9cdf3692f2bda50cdb2c7297b7097f450414b5fb788fbe86a62c.png
# Line plot of the polarity diffusion columns for the same four document types.
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.plot.y = [
    "polarity_diffusion_minutes",
    "polarity_diffusion_press_conf",
    "polarity_diffusion_speech",
    "polarity_diffusion_statement",
]
# NOTE(review): this title is identical to the one used for the mean-polarity
# plot although the diffusion columns are plotted here — confirm it is intended.
cfg.ax.title = "The polarity scores of the FOMC corpus"
eKonf.instantiate(cfg, data=tone_data_lm)
../../../_images/98e6bc525d7de16ec25ed4dbecd75470af50acd099629b89dd50a88b4756aee6.png
# Compare the mean and diffusion polarity columns of the statements in one line plot.
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.plot.y = ["polarity_mean_statement", "polarity_diffusion_statement"]
cfg.ax.title = "The polarity scores of the FOMC statements"
eKonf.instantiate(cfg, data=tone_data_lm)
../../../_images/32419ab379705ef6918ae01b5211478b3afc46fe79d7342059f4491359fc3ce1.png
# Two line plots in one figure: the statement token counts on the primary
# y-axis and the statement example counts on a secondary y-axis.
cfg = eKonf.compose("visualize/plot=lineplot")
# Add a second plot entry as a copy of the default plot config.
cfg.plots.append(cfg.plot.copy())
cfg.plots[0].y = "num_tokens_sum_statement"
cfg.plots[0].rcParams = dict(linewidth=2.5, color="red")
cfg.plots[1].y = "num_examples_statement"
cfg.plots[1].rcParams = dict(linewidth=1.5, color="green")
cfg.plots[1].secondary_y = True
cfg.ax.legend = dict(
    labels=[
        "The number of tokens in the FOMC statements (LHS)",
    ],
    loc=2,
)
# Second axis config: copied from the primary one (after its legend was set),
# then the grid is disabled, it is flagged as secondary and its legend replaced.
ax2 = cfg.ax.copy()
ax2.grid = False
ax2.secondary_y = True
# NOTE(review): the label says "sentences" while the plotted column is
# "num_examples_statement" — presumably one example per sentence; confirm.
ax2.legend = dict(
    labels=["The number of sentences in the FOMC statements (RHS)"],
    loc=1,
)
cfg.axes.append(ax2)

eKonf.instantiate(cfg, data=tone_data_lm)
../../../_images/1e2df3df5c24758eac021cfbac6c68561f6ce83582c8b261a51a83cb957ac233.png
# Diffusion columns of the statements, one per column prefix
# (polarity / finbert / t5).
sentiments = [
    "polarity_diffusion_statement",
    "finbert_diffusion_statement",
    "t5_diffusion_statement",
]

cfg = eKonf.compose("visualize/plot=lineplot")
cfg.plot.y = sentiments
cfg.ax.title = "Comparing polarity scores of the FOMC corpus by models"
# NOTE(review): merged_tone_data is not defined in the visible cells; it must be
# created before this cell runs.
eKonf.instantiate(cfg, data=merged_tone_data)
../../../_images/120fc998e89e1c3cae4a7c3741c125298fcaa991cea485c27fedea813dea5f45.png
# Diffusion columns of the minutes, one per column prefix
# (polarity / finbert / t5).
sentiments = [
    "polarity_diffusion_minutes",
    "finbert_diffusion_minutes",
    "t5_diffusion_minutes",
]

cfg = eKonf.compose("visualize/plot=lineplot")
cfg.plot.y = sentiments
# NOTE(review): same title as the statements comparison although the minutes
# columns are plotted here — confirm it is intended.
cfg.ax.title = "Comparing polarity scores of the FOMC corpus by models"
eKonf.instantiate(cfg, data=merged_tone_data)
../../../_images/ee8de91df754a78a888599bcc32f249fb1c6f3086c253c609f3214388cac581b.png

Plot the results and compare them with periods of economic uncertainty / systemic risk#

def plot_sentiments_over_crisis_periods(name="polarity_diffusion_statement", window=2):
    """Plot a sentiment column and its moving average over shaded recession spans.

    Every record of ``fomc.recessions`` is turned into one vertical span
    (crimson, alpha 0.4) running from its ``from_date`` to its ``to_date``,
    with the record's ``name`` annotated at the temporal midpoint of the span
    at y = -0.75.

    Side effect: the rolling mean is stored in the ``polarity_ma`` column of
    the module-level ``merged_tone_data`` frame, overwriting any previous
    value of that column.

    Args:
        name: Column of ``merged_tone_data`` to plot.
        window: Window size passed to ``rolling`` for the moving average.
    """
    recession_records = fomc.recessions.to_dict(orient="records")

    # Templates that are copied once per recession period.
    span_template = eKonf.compose("visualize/plot/ax/axvspan")
    annot_template = eKonf.compose("visualize/plot/ax/annotation")
    span_template.color = "crimson"
    span_template.alpha = 0.4

    spans, annotations = [], []
    for period in recession_records:
        begin, finish = period["from_date"], period["to_date"]

        shaded = span_template.copy()
        shaded.xmin = eKonf.to_dateparm(begin)
        shaded.xmax = eKonf.to_dateparm(finish)
        spans.append(shaded)

        # The label sits halfway between the start and the end of the period.
        label = annot_template.copy()
        label.text = period["name"]
        label.x = eKonf.to_dateparm(begin + (finish - begin) / 2)
        label.y = -0.75
        annotations.append(label)

    rolling_mean = merged_tone_data[name].rolling(window).mean()
    merged_tone_data["polarity_ma"] = rolling_mean

    cfg = eKonf.compose("visualize/plot=lineplot")
    cfg.plots.append(cfg.plot.copy())
    # (column, line width, palette) for the moving average and the raw series.
    plot_settings = [
        ("polarity_ma", 2.5, "r"),
        (name, 1.5, "g"),
    ]
    for idx, (column, width, palette) in enumerate(plot_settings):
        cfg.plots[idx].y = column
        cfg.plots[idx].linewidth = width
        cfg.plots[idx].palette = palette

    cfg.ax.title = f"Polarity scores of {name}"
    cfg.ax.legend.labels = [
        f"{window} period MA",
        f"Polarity score of {name}",
    ]
    cfg.ax.axvspans = spans
    cfg.ax.annotations = annotations
    eKonf.instantiate(cfg, data=merged_tone_data)
# Draw one recession-overlay figure per statement diffusion column
# (default moving-average window).
sentiments = [
    "polarity_diffusion_statement",
    "finbert_diffusion_statement",
    "t5_diffusion_statement",
]
for name in sentiments:
    plot_sentiments_over_crisis_periods(name)
../../../_images/e7b7b217fd79417ac50546d3b6c71eee0da2bbec77fbd457e9eea4275645652d.png ../../../_images/e4a7b35d8e33d343d017fa146c1847adc96c5650ff7b1516160e8fcc9e37c575.png ../../../_images/52cfcf85b8851fbd567195facd689716d860ea6fa43d413f13278dc514a36b70.png
# Draw one recession-overlay figure per minutes diffusion column
# (default moving-average window).
sentiments = [
    "polarity_diffusion_minutes",
    "finbert_diffusion_minutes",
    "t5_diffusion_minutes",
]
for name in sentiments:
    plot_sentiments_over_crisis_periods(name)
../../../_images/2ef6e64ff8a1305bcd29fc10f68bb6df7303294b7115d1b961568f1078f48ae8.png ../../../_images/b905fd992e8a7bc0c8c3d2df095d3fbb695cb8e54e2e293abff1031056a54438.png ../../../_images/445d8693815e44b49297dd192232a07836ca1ffc177167ac7168e6fca10b9358.png
from datetime import datetime

# Chairperson periods; keep only the rows whose to_date falls in start_year or later.
# NOTE(review): fomc is not defined in the visible cells; it must exist before this runs.
chair = fomc.chairpersons
chair = chair[chair.to_date.dt.year >= start_year]


def plot_sentiments_over_chair_periods(name="polarity_diffusion_statement", window=3):
    """Plot a sentiment column, its moving average and the rate over chair periods.

    Every row of the module-level ``chair`` frame becomes one shaded vertical
    span whose colour and transparency alternate with the parity of the row
    index, with the row's ``last_name`` annotated at the temporal midpoint of
    the span at y = -0.75. The ``rate`` column is drawn as a scatter plot on a
    secondary y-axis labelled "Fed Rate".

    Side effect: the rolling mean is stored in the ``polarity_ma`` column of
    the module-level ``merged_tone_data`` frame, overwriting any previous
    value of that column.

    Args:
        name: Column of ``merged_tone_data`` to plot.
        window: Window size passed to ``rolling`` for the moving average.
    """
    annot_template = eKonf.compose("visualize/plot/ax/annotation")

    spans, annotations = [], []
    for row_num, record in chair.iterrows():
        begin = record["from_date"]
        if begin.year < start_year:
            # NOTE(review): the clamp date is a fixed 2000-01-01 rather than
            # being derived from start_year — confirm this is intended.
            begin = datetime(2000, 1, 1)
        finish = record["to_date"]

        # Even and odd index labels get different shading so that adjacent
        # periods can be told apart.
        is_even = row_num % 2 == 0
        spans.append(
            {
                "xmin": eKonf.to_dateparm(begin),
                "xmax": eKonf.to_dateparm(finish),
                "alpha": 0.4 if is_even else 0.2,
                "color": "crimson" if is_even else "darkblue",
            }
        )

        # The label sits halfway between the start and the end of the period.
        label = annot_template.copy()
        label.text = record["last_name"]
        label.x = eKonf.to_dateparm(begin + (finish - begin) / 2)
        label.y = -0.75
        annotations.append(label)

    rolling_mean = merged_tone_data[name].rolling(window).mean()
    merged_tone_data["polarity_ma"] = rolling_mean

    cfg = eKonf.compose("visualize/plot=lineplot")
    # Taken before the primary axis is customised below, so the secondary axis
    # starts from the composed axis settings.
    secondary_ax = cfg.ax.copy()

    cfg.plots.append(cfg.plot.copy())
    # (column, line style) for the moving average and the raw series.
    line_settings = [
        ("polarity_ma", {"linewidth": 2.5, "color": "red"}),
        (name, {"linewidth": 1.5, "color": "green"}),
    ]
    for idx, (column, line_style) in enumerate(line_settings):
        cfg.plots[idx].y = column
        cfg.plots[idx].rcParams = line_style

    cfg.ax.title = f"Polarity scores of {name}"
    cfg.ax.legend = {
        "labels": [
            f"{window} period MA",
            f"Polarity score of {name}",
        ],
        "loc": 2,
    }

    # Third plot: the "rate" column as scatter points on the secondary y-axis.
    rate_points = eKonf.compose("visualize/plot/scatterplot")
    rate_points.y = "rate"
    rate_points.secondary_y = True
    rate_points.rcParams = {"color": "blue", "alpha": 0.8}
    cfg.plots.append(rate_points)

    secondary_ax.grid = False
    secondary_ax.secondary_y = True
    secondary_ax.ylim = "(-4, 10)"
    secondary_ax.legend = {"labels": ["Fed Rate"], "loc": 1}
    cfg.axes.append(secondary_ax)

    cfg.ax.axvspans = spans
    cfg.ax.annotations = annotations

    eKonf.instantiate(cfg, data=merged_tone_data)
# Draw one chair-period figure per statement diffusion column
# (default moving-average window).
sentiments = [
    "polarity_diffusion_statement",
    "finbert_diffusion_statement",
    "t5_diffusion_statement",
]
for name in sentiments:
    plot_sentiments_over_chair_periods(name)
../../../_images/9ce0286b5565e32515e4fac603a281509321a545fc3bceff5d6b0f197ffd34b9.png ../../../_images/958cf80d504d111551c4dbcbb21ce2039decc7b1398ac951f761a1ec8d66031c.png ../../../_images/dfefdf1a314a248729080c670d135492991c6613ff4973e74963e651098ceb31.png
# Draw one chair-period figure per minutes diffusion column
# (default moving-average window).
sentiments = [
    "polarity_diffusion_minutes",
    "finbert_diffusion_minutes",
    "t5_diffusion_minutes",
]
for name in sentiments:
    plot_sentiments_over_chair_periods(name)
../../../_images/70155c141cd2a4a38fa29e3e3997d55376b8a23c87b6d6c6f41d52bda44c1f6a.png ../../../_images/342d4a098c875d755d9672737bc01126891556e82811c3f90e42477ada5bb208.png ../../../_images/264de6704848baabd60952ace8f83257ef93cfd886dc86b5ec953ccc2493c7e3.png
# Draw one chair-period figure per "*_tones" column.
# NOTE(review): these columns are presumably aggregate tone scores per model
# present in merged_tone_data — their construction is not shown in the visible cells.
sentiments = ["lm_tones", "finbert_tones", "t5_tones"]
for name in sentiments:
    plot_sentiments_over_chair_periods(name)
../../../_images/3069c2118b9089819a80c6d30db38d8f00021c611ff09a37c59ee369246bb52b.png ../../../_images/b4cf798778e68fa69a90602ce93cd3dd3b0d69110a59c6e988d90d9bc5f34c2d.png ../../../_images/cef20b63b33c61728fb6b5b3a6ca15458734562971293cc6c01d02f619885584.png