Preparing active learning data

Contents

Preparing active learning data#

from ekorpkit import eKonf

# Mount Google Drive only when the notebook is running on Colab.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# Workspace settings for this notebook session.
workspace_options = {
    "workspace": "/workspace",
    "project": "ekorpkit-book/exmaples",
    "task": "esg",
    "log_level": "INFO",
    "verbose": True,
}
ws = eKonf.set_workspace(**workspace_options)

# Show which ekorpkit version and project directory are in effect.
print("version:", ws.version)
print("project_dir:", ws.project_dir)

INFO:ekorpkit.utils.notebook:Google Colab not detected.

version: 0.1.40.post0.dev58
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 493 ms (started: 2022-12-17 00:29:17 +00:00)

Load data#

# Directory holding the sliced news data for the ESG task.
data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice/"

# Load every chunk file matching the glob and concatenate them into one frame.
chunk_pattern = "econ_news_kr_chunks_*_20220911.parquet"
chunk_data = eKonf.load_data(chunk_pattern, data_dir, concatenate=True)

print(chunk_data.shape)
chunk_data.head()

(1833910, 3)

	text	chunk_id	filename
0	△ 사진 설명 : 경자년 새해가 밝았다 제각각 삶터에서 묵묵히 노력하는 대한민국...	0	02100101.20200101001103001.txt
1	"청년 벤처기업가들한테 진짜로 미안했어요 부끄럽고 " 지난해 12월 29일 서울...	0	02100101.20200101001253001.txt
2	오늘부터 30년 후면 2050년이다 30년 전인 1990년을 전후하여 세계는 독...	0	02100101.20200101001855001.txt
3	◆ 2020 경제기상도 / 환율 ◆ 지난해 미·중 무역분쟁이라는 대형 변수로 출렁...	0	02100101.20200101040159002.txt
4	◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...	0	02100101.20200101040200001.txt

time: 15.8 s (started: 2022-12-17 00:30:03 +00:00)

# Print column dtypes, index range and memory usage of the concatenated chunks.
chunk_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1833910 entries, 0 to 785336
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   text      object
 1   chunk_id  int64 
 2   filename  object
dtypes: int64(1), object(2)
memory usage: 56.0+ MB
time: 3.92 ms (started: 2022-12-10 07:20:55 +00:00)

Load company code info#

# Compose the plain-text parser loader and point it at the company-code
# text files stored in the same directory as the chunk data.
cfg = eKonf.compose("io/loader=plaintext_parser")
cfg.data_dir = data_dir
cfg.data_sources = "econ_news_kr_chunks*.txt"

# Parser options. Presumably each file is split into records stored under the
# "codes" key and progress logging is turned off — verify against ekorpkit.
cfg.parser.data_key = "codes"
cfg.parser.split = True
cfg.parser.progress_per = None

# Fields kept for every parsed record.
cfg.data.item = {"filename": "filename", "codes": "codes"}

code_info = eKonf.load_data(**cfg)
code_info.tail()

INFO:root:compose config with overrides: ['+io/loader=plaintext_parser']
INFO:ekorpkit.base:setting environment variable CACHED_PATH_CACHE_ROOT to /root/.ekorpkit/.cache/cached_path
INFO:ekorpkit.base:setting environment variable KMP_DUPLICATE_LIB_OK to TRUE
INFO:ekorpkit.io.file:Processing [3] files from ['econ_news_kr_chunks*.txt']
INFO:ekorpkit.io.load.data:==> processing 1/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 37495
INFO:ekorpkit.io.load.data:==> processing 2/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 108675
INFO:ekorpkit.io.load.data:==> processing 3/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 97506

{'filename': 'econ_news_kr_chunks_2022_code_20220911.txt', 'codes': '02100101.20220101002520005,1,005380'}

	filename	codes
243671	econ_news_kr_chunks_2020_code_20220911.txt	02100851.20201231110236001,1,032640
243672	econ_news_kr_chunks_2020_code_20220911.txt	02100851.20201231111427001,1,035720
243673	econ_news_kr_chunks_2020_code_20220911.txt	02100851.20201231132917001,0,011200
243674	econ_news_kr_chunks_2020_code_20220911.txt	02100851.20201231164457001,0,000660
243675	econ_news_kr_chunks_2020_code_20220911.txt	02100851.20201231204324001,0,068270

time: 24.9 s (started: 2022-12-10 07:21:05 +00:00)

# Each raw record has the form "<26-char article id>,<chunk_id>,<code>[,<code>...]".
# Characters 0-25 are the article id, character 26 is the separating comma.
raw_records = code_info["codes"]
fields = raw_records.str[27:].str.split(",")

code_info_available = code_info.copy()
# Rebuild the article's file name from its id so it can be joined with chunk_data.
code_info_available["filename"] = raw_records.str[:26] + ".txt"
# Everything after the chunk id is the list of company codes for that chunk.
code_info_available["codes"] = fields.str[1:]
# The first field is the chunk index within the article.
code_info_available["chunk_id"] = fields.str[0].astype("int64")
code_info_available["num_codes"] = code_info_available["codes"].str.len()
# One row per (article, chunk, company code).
code_info_available = code_info_available.explode("codes").reset_index(drop=True)

eKonf.save_data(
    code_info_available,
    "econ_news_code_info_available_20220911.parquet",
    data_dir,
)
print(code_info_available.shape)
code_info_available.info()

INFO:ekorpkit.io.file:Saving dataframe to ../data/econ_news_kr/news_slice/econ_news_code_info_available_20220911.parquet

(258537, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258537 entries, 0 to 258536
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   filename   258537 non-null  object
 1   codes      258537 non-null  object
 2   chunk_id   258537 non-null  int64 
 3   num_codes  258537 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 7.9+ MB
time: 1.39 s (started: 2022-12-10 07:21:44 +00:00)

# Keep only the chunks that have company-code information attached.
# NOTE(review): the printed result has more rows (267111) than
# code_info_available (258537), so some (filename, chunk_id) keys seem to
# occur more than once in chunk_data — confirm whether that is expected.
merge_keys = ["filename", "chunk_id"]
cols = ["filename", "chunk_id", "text", "codes"]
filtered_data = chunk_data.merge(code_info_available, on=merge_keys, how="inner")[cols]

eKonf.save_data(filtered_data, "econ_news_filtered_20220911.parquet", data_dir)
print(filtered_data.shape)
filtered_data.head()

INFO:ekorpkit.io.file:Saving dataframe to ../data/econ_news_kr/news_slice/econ_news_filtered_20220911.parquet

(267111, 4)

	filename	chunk_id	text	codes
0	02100101.20200101040200001.txt	0	◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...	000660
1	02100101.20200101040200001.txt	1	지난해 3분기 반도체 부문 영업이익 3조500억원으로 3조원대에 간신히 턱걸이한 ...	005930
2	02100101.20200101040200002.txt	0	◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...	005930
3	02100101.20200101040200002.txt	0	◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...	066570
4	02100101.20200101040201001.txt	1	디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L...	003550

time: 24.3 s (started: 2022-12-10 07:21:48 +00:00)

Filter out invalid data#

# Classification model config on top of the ekonelectra-base checkpoint,
# using the model named "esg_invalid".
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_invalid"

# Pipeline with a single `predict` step over the filtered news chunks; the
# predictions are written back into the same data directory.
run_name = "esg_invalid_20220911"
cfg = eKonf.compose(config_group="pipeline")
cfg.name = run_name
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "econ_news_filtered_20220911.parquet"
cfg._pipeline_ = ["predict"]
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = f"{run_name}-preds.parquet"

invalid_preds_df = eKonf.instantiate(cfg)
invalid_preds_df.head()

INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors

	filename	chunk_id	text	codes	pred_labels	raw_preds	pred_probs
0	02100101.20200101040200001.txt	0	◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...	000660	Discarded	{'Discarded': 0.1799878992578683, 'Validated':...	0.206845
1	02100101.20200101040200001.txt	1	지난해 3분기 반도체 부문 영업이익 3조500억원으로 3조원대에 간신히 턱걸이한 ...	005930	Discarded	{'Discarded': 0.6575677396177281, 'Validated':...	0.657568
2	02100101.20200101040200002.txt	0	◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...	005930	Discarded	{'Discarded': 0.13758843340302507, 'Validated'...	0.156780
3	02100101.20200101040200002.txt	0	◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...	066570	Discarded	{'Discarded': 0.13758843340302507, 'Validated'...	0.156780
4	02100101.20200101040201001.txt	1	디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L...	003550	Discarded	{'Discarded': 0.5148541733920216, 'Validated':...	0.514854

# Keep only the rows the classifier labelled "Validated", dropping the
# prediction columns so the file has the same schema as the filtered input.
cols = ["filename", "chunk_id", "text", "codes"]
is_validated = invalid_preds_df["pred_labels"] == "Validated"
valid_data = invalid_preds_df.loc[is_validated, cols]
print(valid_data.shape)

filename = "esg_news_valid_20220911.parquet"
eKonf.save_data(valid_data, filename, data_dir)

(115784, 4)

Predict polarities#

# Classification model config on top of the ekonelectra-base checkpoint,
# using the model named "esg_cv_polarity".
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_cv_polarity"
model_cfg.verbose = False

# Pipeline with a single `predict` step over the validated news chunks; the
# predictions are written back into the same data directory.
run_name = "esg_news_polarities_20220911"
cfg = eKonf.compose(config_group="pipeline")
cfg.name = run_name
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "esg_news_valid_20220911.parquet"
cfg._pipeline_ = ["predict"]
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = f"{run_name}-preds.parquet"

preds_df = eKonf.instantiate(cfg)
preds_df.head()

INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call

Token indices sequence length is longer than the specified maximum sequence length for this model (1492 > 512). Running this sequence through the model will result in indexing errors

	filename	chunk_id	text	codes	pred_labels	raw_preds	pred_probs
5	02100101.20200101040202001.txt	0	◆ 2020 경제기상도 / 업종별 전망 (자동차) ◆ 지난해 국내 자동차 시장은 ...	000270	Negative	{'Neutral': 0.22110844002351862, 'Positive': 0...	0.717613
10	02100101.20200101040206003.txt	0	◆ 2020 경제기상도 / 업종별 전망 (유통) ◆ 2020년은 국내 대표 온·오...	139480	Neutral	{'Neutral': 0.8255044887345607, 'Positive': 0....	0.825504
11	02100101.20200101060214001.txt	0	[Pulse로 배우는 영작문-762] 'Pulse로 배우는 영작문'으로 영문 뉴스...	005380	Neutral	{'Neutral': 0.11524808398723353, 'Positive': 0...	0.126036
18	02100101.20200101183234001.txt	0	제네시스의 첫 번째 스포츠유틸리티차량(SUV) 모델인 GV80가 역동적이면서 우아...	005380	Neutral	{'Neutral': 0.2245705448013361, 'Positive': 0....	0.271877
19	02100101.20200101184423001.txt	0	◆ 2020 유통대전 ① ◆ 유통기업 '빅(Big)2' 롯데쇼핑과 이마트의 새 수...	139480	Neutral	{'Neutral': 0.9101777918673029, 'Positive': 0....	0.910178

Predict categories#

# Classification model config on top of the ekonelectra-base checkpoint,
# using the model named "esg_cv_topics".
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_cv_topics"
model_cfg.verbose = False

# Pipeline with a single `predict` step over the validated news chunks; the
# predictions are written back into the same data directory.
run_name = "esg_news_topics_20220911"
cfg = eKonf.compose(config_group="pipeline")
cfg.name = run_name
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "esg_news_valid_20220911.parquet"
cfg._pipeline_ = ["predict"]
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = f"{run_name}-preds.parquet"

category_preds_df = eKonf.instantiate(cfg)
category_preds_df.head()

INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call

Token indices sequence length is longer than the specified maximum sequence length for this model (1492 > 512). Running this sequence through the model will result in indexing errors

	filename	chunk_id	text	codes	pred_labels	raw_preds	pred_probs
5	02100101.20200101040202001.txt	0	◆ 2020 경제기상도 / 업종별 전망 (자동차) ◆ 지난해 국내 자동차 시장은 ...	000270	S-고용	{'S-고용': 0.9784880492636585, 'E-환경영향': 0.00082...	0.978488
10	02100101.20200101040206003.txt	0	◆ 2020 경제기상도 / 업종별 전망 (유통) ◆ 2020년은 국내 대표 온·오...	139480	S-소비자	{'S-고용': 0.22015353503574592, 'E-환경영향': 0.0088...	0.559892
11	02100101.20200101060214001.txt	0	[Pulse로 배우는 영작문-762] 'Pulse로 배우는 영작문'으로 영문 뉴스...	005380	S-소비자	{'S-고용': 0.000642749081254596, 'E-환경영향': 0.002...	0.223181
18	02100101.20200101183234001.txt	0	제네시스의 첫 번째 스포츠유틸리티차량(SUV) 모델인 GV80가 역동적이면서 우아...	005380	S-소비자	{'S-고용': 0.0018075713776912943, 'E-환경영향': 0.00...	0.330156
19	02100101.20200101184423001.txt	0	◆ 2020 유통대전 ① ◆ 유통기업 '빅(Big)2' 롯데쇼핑과 이마트의 새 수...	139480	S-고용	{'S-고용': 0.9527512959359385, 'E-환경영향': 0.00096...	0.952751

Import data to Label Studio#

# Reload the saved predictions from the directory the predict pipelines wrote to.
# The path must include the "esg/" task segment, exactly as in the "Load data"
# cell. Without it, data_dir resolves to <project_dir>/data/... and the
# prediction files are not found there.
data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice/"

polarity_preds_df = eKonf.load_data("esg_news_polarities_20220911-preds.parquet", data_dir)
category_preds_df = eKonf.load_data("esg_news_topics_20220911-preds.parquet", data_dir)

INFO:ekorpkit.io.file:Processing [1] files from ['esg_news_polarities_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_polarities_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_polarities_20220911-preds.parquet
INFO:ekorpkit.io.file:Processing [1] files from ['esg_news_topics_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_topics_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_topics_20220911-preds.parquet

time: 3.03 s (started: 2022-12-10 08:34:51 +00:00)

# A prediction row is identified by article file, chunk index and company code.
join_keys = ["filename", "chunk_id", "codes"]

# Polarity predictions, with the topic model's label attached as `category_preds`.
category_labels = category_preds_df.rename(columns={"pred_labels": "category_preds"})
category_labels = category_labels[join_keys + ["category_preds"]]
polarity_data = polarity_preds_df.merge(category_labels, on=join_keys)
polarity_data = polarity_data.drop(columns=["raw_preds"])

# Topic predictions, with the polarity model's label attached as `polarity_preds`.
polarity_labels = polarity_preds_df.rename(columns={"pred_labels": "polarity_preds"})
polarity_labels = polarity_labels[join_keys + ["polarity_preds"]]
category_data = category_preds_df.merge(polarity_labels, on=join_keys)
category_data = category_data.drop(columns=["raw_preds"])

time: 235 ms (started: 2022-12-10 08:34:54 +00:00)

from ekorpkit.io.fetch.labelstudio import LabelStudio

# Uncomment the next two lines to inspect the composed Label Studio fetcher
# config before creating the client:
# cfg = eKonf.compose("io/fetcher=labelstudio")
# eKonf.print(cfg)
# Create the Label Studio client without passing any explicit arguments.
ls = LabelStudio()

INFO:root:compose config with overrides: ['+io/fetcher=labelstudio']
INFO:ekorpkit.config:Initalized batch: labelstudio(2) in /workspace/projects/ekorpkit-book/exmaples/esg/data

time: 756 ms (started: 2022-12-10 08:34:54 +00:00)

# List the existing Label Studio projects (verbose=True prints their titles).
project_list = ls.list_projects(verbose=True)

ESG Topic Classification (Sep 2022)
ESG Polarity Classification (Sep 2022)
ESG Topic Classification
ESG Polarity Classification
time: 1.1 s (started: 2022-12-10 08:34:55 +00:00)

# NOTE(review): destructive call with a hard-coded project id. Check the id
# against the project list before re-running this cell on another server.
ls.delete_project(11)

# Labeling interface for the polarity task: the chunk text plus a
# single-choice selector named "sentiment" with three options.
label_config = """
<View>
  <Header value="Choose text sentiment:"/>
  <Text name="text" value="$text"/>
  <Choices name="sentiment" toName="text" choice="single" showInline="true">
    <Choice value="Positive"/>
    <Choice value="Negative"/>
    <Choice value="Neutral"/>
  </Choices>
</View>
"""

project = ls.create_project(
    label_config=label_config,
    title="ESG Polarity Classification (Sep 2022)",
    color="#D55C9D",
)

# Convert a random 80% sample of the polarity predictions into a predictions
# file; `choices_name` matches the <Choices name="sentiment"> tag above.
polarity_sample = polarity_data.sample(frac=0.8)
pred_file = ls.dataframe_to_predictions(
    polarity_sample,
    choices_name="sentiment",
    model_version="1.0",
    prediction_file="polarity_preds.json",
)
# Upload step is left disabled; the project id below is hard-coded.
# ls.import_file(pred_file, project_id=12)

INFO:ekorpkit.io.fetch.labelstudio:/workspace/projects/ekorpkit-book/exmaples/esg/data/outputs/labelstudio/labelstudio(2)_polarity_preds.json is saved

time: 7.08 s (started: 2022-12-10 08:34:59 +00:00)

# Labeling interface for the ESG topic task: the chunk text plus a
# single-choice selector over the ESG topic classes. The selector keeps the
# name "sentiment" because `choices_name="sentiment"` below refers to it.
# The header text shown to annotators previously read "clas"; it is now "class".
label_config = """
<View>
  <Header value="Choose ESG topic class:"/>
  <Text name="text" value="$text"/>
  <Choices name="sentiment" toName="text" choice="single" showInline="true">
    <Choice value="E-기후변화"/>
    <Choice value="E-환경영향"/>
    <Choice value="E-환경혁신"/>
    <Choice value="S-재해/안전관리"/>
    <Choice value="S-고용"/>
    <Choice value="S-기업(공급망)동반성장/상생"/>
    <Choice value="S-사회공헌"/>
    <Choice value="S-소비자"/>
    <Choice value="G-지배구조"/>
    <Choice value="G-기업윤리/불공정/소송"/>
    <Choice value="UNKNOWN"/>
  </Choices>
</View>
"""

project = ls.create_project(
    title="ESG Topic Classification (Sep 2022)",
    label_config=label_config,
    color="#51AAFD",
)
# Show the id assigned to the new project; the disabled upload below uses it.
print(project["id"], project["title"])

# Convert a random 80% sample of the topic predictions into a predictions file.
pred_file = ls.dataframe_to_predictions(
    category_data.sample(frac=0.8),
    prediction_file="category_preds.json",
    choices_name="sentiment",
    model_version="1.0",
)
# Upload step is left disabled:
# ls.import_file(pred_file, project_id=project["id"])

INFO:ekorpkit.io.fetch.labelstudio:/workspace/projects/ekorpkit-book/exmaples/esg/data/outputs/labelstudio/labelstudio(2)_category_preds.json is saved

time: 7.16 s (started: 2022-12-10 08:35:06 +00:00)

# Labeling interface for the polarity task: the chunk text plus a
# single-choice selector named "sentiment" with three options.
# NOTE(review): title and config repeat the polarity project created earlier
# in this notebook; running both cells presumably creates two projects with
# the same title — confirm this is intended.
label_config = """
<View>
  <Header value="Choose text sentiment:"/>
  <Text name="text" value="$text"/>
  <Choices name="sentiment" toName="text" choice="single" showInline="true">
    <Choice value="Positive"/>
    <Choice value="Negative"/>
    <Choice value="Neutral"/>
  </Choices>
</View>
"""

project = ls.create_project(
    label_config=label_config,
    title="ESG Polarity Classification (Sep 2022)",
)