Preparing active learning data
from ekorpkit import eKonf
# Mount Google Drive first when the notebook is running on Colab.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# Workspace settings for the ESG task of the book's example project.
workspace_options = dict(
    workspace="/workspace",
    project="ekorpkit-book/exmaples",
    task="esg",
    log_level="INFO",
    verbose=True,
)
ws = eKonf.set_workspace(**workspace_options)

# Report which ekorpkit version and project directory are in use.
print("version:", ws.version)
print("project_dir:", ws.project_dir)
INFO:ekorpkit.utils.notebook:Google Colab not detected.
version: 0.1.40.post0.dev58
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 493 ms (started: 2022-12-17 00:29:17 +00:00)
Load data
# Directory that holds the sliced Korean economic-news files.
data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice/"

# Load every parquet file matching the pattern; concatenate=True requests a
# single combined result instead of one object per file.
chunk_file_pattern = "econ_news_kr_chunks_*_20220911.parquet"
chunk_data = eKonf.load_data(chunk_file_pattern, data_dir, concatenate=True)

print(chunk_data.shape)
chunk_data.head()
(1833910, 3)
text | chunk_id | filename | |
---|---|---|---|
0 | △ 사진 설명 : 경자년 새해가 밝았다 제각각 삶터에서 묵묵히 노력하는 대한민국... | 0 | 02100101.20200101001103001.txt |
1 | "청년 벤처기업가들한테 진짜로 미안했어요 부끄럽고 " 지난해 12월 29일 서울... | 0 | 02100101.20200101001253001.txt |
2 | 오늘부터 30년 후면 2050년이다 30년 전인 1990년을 전후하여 세계는 독... | 0 | 02100101.20200101001855001.txt |
3 | ◆ 2020 경제기상도 / 환율 ◆ 지난해 미·중 무역분쟁이라는 대형 변수로 출렁... | 0 | 02100101.20200101040159002.txt |
4 | ◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공... | 0 | 02100101.20200101040200001.txt |
time: 15.8 s (started: 2022-12-17 00:30:03 +00:00)
# Print a summary of the loaded chunks (index range, column dtypes, memory).
chunk_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1833910 entries, 0 to 785336
Data columns (total 3 columns):
# Column Dtype
--- ------ -----
0 text object
1 chunk_id int64
2 filename object
dtypes: int64(1), object(2)
memory usage: 56.0+ MB
time: 3.92 ms (started: 2022-12-10 07:20:55 +00:00)
Load company code info
# Compose the plain-text loader config and point it at the code-list files.
cfg = eKonf.compose("io/loader=plaintext_parser")
cfg.data_dir = data_dir
cfg.data_sources = "econ_news_kr_chunks*.txt"

# Parser options; their exact semantics are defined by ekorpkit's
# plaintext parser (not visible here).
cfg.parser.data_key = "codes"
cfg.parser.split = True
cfg.parser.progress_per = None

# Item mapping for the loaded records: `filename` and `codes`.
cfg.data.item = {"filename": "filename", "codes": "codes"}

code_info = eKonf.load_data(**cfg)
code_info.tail()
INFO:root:compose config with overrides: ['+io/loader=plaintext_parser']
INFO:ekorpkit.base:setting environment variable CACHED_PATH_CACHE_ROOT to /root/.ekorpkit/.cache/cached_path
INFO:ekorpkit.base:setting environment variable KMP_DUPLICATE_LIB_OK to TRUE
INFO:ekorpkit.io.file:Processing [3] files from ['econ_news_kr_chunks*.txt']
INFO:ekorpkit.io.load.data:==> processing 1/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 37495
INFO:ekorpkit.io.load.data:==> processing 2/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 108675
INFO:ekorpkit.io.load.data:==> processing 3/3 files <==
INFO:ekorpkit.io.parse.json:Number of data in the contents: 97506
{'filename': 'econ_news_kr_chunks_2022_code_20220911.txt', 'codes': '02100101.20220101002520005,1,005380'}
filename | codes | |
---|---|---|
243671 | econ_news_kr_chunks_2020_code_20220911.txt | 02100851.20201231110236001,1,032640 |
243672 | econ_news_kr_chunks_2020_code_20220911.txt | 02100851.20201231111427001,1,035720 |
243673 | econ_news_kr_chunks_2020_code_20220911.txt | 02100851.20201231132917001,0,011200 |
243674 | econ_news_kr_chunks_2020_code_20220911.txt | 02100851.20201231164457001,0,000660 |
243675 | econ_news_kr_chunks_2020_code_20220911.txt | 02100851.20201231204324001,0,068270 |
time: 24.9 s (started: 2022-12-10 07:21:05 +00:00)
# Each raw `codes` record looks like "<26-char article id>,<chunk id>,<code>..."
# (see the sample rows printed by the loader above).
code_info_available = code_info.copy()
raw_records = code_info_available["codes"]

# The first 26 characters are the article id; the rest (after the comma at
# position 26) is a comma-separated list: chunk id first, then company codes.
article_ids = raw_records.str.slice(stop=26)
fields = raw_records.str.slice(start=27).str.split(",")

code_info_available["filename"] = article_ids + ".txt"
code_info_available["codes"] = fields.map(lambda items: items[1:])
code_info_available["chunk_id"] = fields.map(lambda items: int(items[0]))
code_info_available["num_codes"] = code_info_available["codes"].map(len)

# One row per (article chunk, company code) pair.
code_info_available = code_info_available.explode("codes").reset_index(drop=True)

eKonf.save_data(
    code_info_available,
    "econ_news_code_info_available_20220911.parquet",
    data_dir,
)
print(code_info_available.shape)
code_info_available.info()
INFO:ekorpkit.io.file:Saving dataframe to ../data/econ_news_kr/news_slice/econ_news_code_info_available_20220911.parquet
(258537, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258537 entries, 0 to 258536
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 filename 258537 non-null object
1 codes 258537 non-null object
2 chunk_id 258537 non-null int64
3 num_codes 258537 non-null int64
dtypes: int64(2), object(2)
memory usage: 7.9+ MB
time: 1.39 s (started: 2022-12-10 07:21:44 +00:00)
# Keep only the chunks that have company-code information attached.
join_keys = ["filename", "chunk_id"]
cols = ["filename", "chunk_id", "text", "codes"]

merged = chunk_data.merge(code_info_available, how="inner", on=join_keys)
filtered_data = merged[cols]

eKonf.save_data(filtered_data, "econ_news_filtered_20220911.parquet", data_dir)
print(filtered_data.shape)
filtered_data.head()
INFO:ekorpkit.io.file:Saving dataframe to ../data/econ_news_kr/news_slice/econ_news_filtered_20220911.parquet
(267111, 4)
filename | chunk_id | text | codes | |
---|---|---|---|---|
0 | 02100101.20200101040200001.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공... | 000660 |
1 | 02100101.20200101040200001.txt | 1 | 지난해 3분기 반도체 부문 영업이익 3조500억원으로 3조원대에 간신히 턱걸이한 ... | 005930 |
2 | 02100101.20200101040200002.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전... | 005930 |
3 | 02100101.20200101040200002.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전... | 066570 |
4 | 02100101.20200101040201001.txt | 1 | 디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L... | 003550 |
time: 24.3 s (started: 2022-12-10 07:21:48 +00:00)
Filter out invalid data
# Classification model config using the ekonelectra-base pretrained override.
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_invalid"

# Pipeline with a single `predict` step over the filtered news file.
cfg = eKonf.compose(config_group="pipeline")
cfg.name = "esg_invalid_20220911"
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "econ_news_filtered_20220911.parquet"
cfg._pipeline_ = ["predict"]

# Predictions are written next to the input data, named after the run.
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = "{}-preds.parquet".format(cfg.name)

invalid_preds_df = eKonf.instantiate(cfg)
invalid_preds_df.head()
INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
filename | chunk_id | text | codes | pred_labels | raw_preds | pred_probs | |
---|---|---|---|---|---|---|---|
0 | 02100101.20200101040200001.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공... | 000660 | Discarded | {'Discarded': 0.1799878992578683, 'Validated':... | 0.206845 |
1 | 02100101.20200101040200001.txt | 1 | 지난해 3분기 반도체 부문 영업이익 3조500억원으로 3조원대에 간신히 턱걸이한 ... | 005930 | Discarded | {'Discarded': 0.6575677396177281, 'Validated':... | 0.657568 |
2 | 02100101.20200101040200002.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전... | 005930 | Discarded | {'Discarded': 0.13758843340302507, 'Validated'... | 0.156780 |
3 | 02100101.20200101040200002.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전... | 066570 | Discarded | {'Discarded': 0.13758843340302507, 'Validated'... | 0.156780 |
4 | 02100101.20200101040201001.txt | 1 | 디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L... | 003550 | Discarded | {'Discarded': 0.5148541733920216, 'Validated':... | 0.514854 |
# Keep only the rows the validity classifier labelled "Validated".
cols = ["filename", "chunk_id", "text", "codes"]
is_validated = invalid_preds_df["pred_labels"] == "Validated"
valid_data = invalid_preds_df.loc[is_validated, cols]
print(valid_data.shape)

filename = "esg_news_valid_20220911.parquet"
eKonf.save_data(valid_data, filename, data_dir)
(115784, 4)
Predict polarities
# Classification model config using the ekonelectra-base pretrained override.
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_cv_polarity"
model_cfg.verbose = False

# Pipeline with a single `predict` step over the validated news file.
cfg = eKonf.compose(config_group="pipeline")
cfg.name = "esg_news_polarities_20220911"
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "esg_news_valid_20220911.parquet"
cfg._pipeline_ = ["predict"]

# Predictions are written next to the input data, named after the run.
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = "{}-preds.parquet".format(cfg.name)

preds_df = eKonf.instantiate(cfg)
preds_df.head()
INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call
Token indices sequence length is longer than the specified maximum sequence length for this model (1492 > 512). Running this sequence through the model will result in indexing errors
filename | chunk_id | text | codes | pred_labels | raw_preds | pred_probs | |
---|---|---|---|---|---|---|---|
5 | 02100101.20200101040202001.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (자동차) ◆ 지난해 국내 자동차 시장은 ... | 000270 | Negative | {'Neutral': 0.22110844002351862, 'Positive': 0... | 0.717613 |
10 | 02100101.20200101040206003.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (유통) ◆ 2020년은 국내 대표 온·오... | 139480 | Neutral | {'Neutral': 0.8255044887345607, 'Positive': 0.... | 0.825504 |
11 | 02100101.20200101060214001.txt | 0 | [Pulse로 배우는 영작문-762] 'Pulse로 배우는 영작문'으로 영문 뉴스... | 005380 | Neutral | {'Neutral': 0.11524808398723353, 'Positive': 0... | 0.126036 |
18 | 02100101.20200101183234001.txt | 0 | 제네시스의 첫 번째 스포츠유틸리티차량(SUV) 모델인 GV80가 역동적이면서 우아... | 005380 | Neutral | {'Neutral': 0.2245705448013361, 'Positive': 0.... | 0.271877 |
19 | 02100101.20200101184423001.txt | 0 | ◆ 2020 유통대전 ① ◆ 유통기업 '빅(Big)2' 롯데쇼핑과 이마트의 새 수... | 139480 | Neutral | {'Neutral': 0.9101777918673029, 'Positive': 0.... | 0.910178 |
Predict categories
# Classification model config using the ekonelectra-base pretrained override.
overrides = [
    "+model/transformer=classification",
    "+model/transformer/pretrained=ekonelectra-base",
]
model_cfg = eKonf.compose("model/transformer=classification", overrides)
model_cfg.name = "esg_cv_topics"
model_cfg.verbose = False

# Pipeline with a single `predict` step over the validated news file.
cfg = eKonf.compose(config_group="pipeline")
cfg.name = "esg_news_topics_20220911"
cfg.num_workers = 1
cfg.data_dir = data_dir
cfg.data_file = "esg_news_valid_20220911.parquet"
cfg._pipeline_ = ["predict"]

# Predictions are written next to the input data, named after the run.
cfg.predict.model = model_cfg
cfg.predict.output_dir = data_dir
cfg.predict.output_file = "{}-preds.parquet".format(cfg.name)

category_preds_df = eKonf.instantiate(cfg)
category_preds_df.head()
INFO:ekorpkit.base:Applying pipe: functools.partial(<function predict at 0x7f2e20d95a60>)
INFO:ekorpkit.base:No method defined to call
Token indices sequence length is longer than the specified maximum sequence length for this model (1492 > 512). Running this sequence through the model will result in indexing errors
filename | chunk_id | text | codes | pred_labels | raw_preds | pred_probs | |
---|---|---|---|---|---|---|---|
5 | 02100101.20200101040202001.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (자동차) ◆ 지난해 국내 자동차 시장은 ... | 000270 | S-고용 | {'S-고용': 0.9784880492636585, 'E-환경영향': 0.00082... | 0.978488 |
10 | 02100101.20200101040206003.txt | 0 | ◆ 2020 경제기상도 / 업종별 전망 (유통) ◆ 2020년은 국내 대표 온·오... | 139480 | S-소비자 | {'S-고용': 0.22015353503574592, 'E-환경영향': 0.0088... | 0.559892 |
11 | 02100101.20200101060214001.txt | 0 | [Pulse로 배우는 영작문-762] 'Pulse로 배우는 영작문'으로 영문 뉴스... | 005380 | S-소비자 | {'S-고용': 0.000642749081254596, 'E-환경영향': 0.002... | 0.223181 |
18 | 02100101.20200101183234001.txt | 0 | 제네시스의 첫 번째 스포츠유틸리티차량(SUV) 모델인 GV80가 역동적이면서 우아... | 005380 | S-소비자 | {'S-고용': 0.0018075713776912943, 'E-환경영향': 0.00... | 0.330156 |
19 | 02100101.20200101184423001.txt | 0 | ◆ 2020 유통대전 ① ◆ 유통기업 '빅(Big)2' 롯데쇼핑과 이마트의 새 수... | 139480 | S-고용 | {'S-고용': 0.9527512959359385, 'E-환경영향': 0.00096... | 0.952751 |
Import data to Label Studio
# Reload the saved prediction files so this section can be run on its own.
# The directory must be the one the prediction pipelines wrote to
# (project_dir / "esg/data/..."); the previous "data/..." path pointed at a
# different location than the one used when loading and saving earlier.
data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice/"
polarity_preds_df = eKonf.load_data("esg_news_polarities_20220911-preds.parquet", data_dir)
category_preds_df = eKonf.load_data("esg_news_topics_20220911-preds.parquet", data_dir)
INFO:ekorpkit.io.file:Processing [1] files from ['esg_news_polarities_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_polarities_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_polarities_20220911-preds.parquet
INFO:ekorpkit.io.file:Processing [1] files from ['esg_news_topics_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_topics_20220911-preds.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit-book/exmaples/esg/data/econ_news_kr/news_slice/esg_news_topics_20220911-preds.parquet
time: 3.03 s (started: 2022-12-10 08:34:51 +00:00)
# Columns that identify one (article chunk, company code) row in both frames.
merge_keys = ["filename", "chunk_id", "codes"]

# Polarity predictions, with the predicted topic added as `category_preds`.
category_labels = category_preds_df.rename(
    columns={"pred_labels": "category_preds"}
)[merge_keys + ["category_preds"]]
polarity_data = polarity_preds_df.merge(category_labels, on=merge_keys)
polarity_data = polarity_data.drop(columns=["raw_preds"])

# Topic predictions, with the predicted polarity added as `polarity_preds`.
polarity_labels = polarity_preds_df.rename(
    columns={"pred_labels": "polarity_preds"}
)[merge_keys + ["polarity_preds"]]
category_data = category_preds_df.merge(polarity_labels, on=merge_keys)
category_data = category_data.drop(columns=["raw_preds"])
time: 235 ms (started: 2022-12-10 08:34:54 +00:00)
from ekorpkit.io.fetch.labelstudio import LabelStudio
# Optional: compose and print the fetcher config to inspect it first.
# cfg = eKonf.compose("io/fetcher=labelstudio")
# eKonf.print(cfg)
# Create the Label Studio client; no arguments are passed here.
ls = LabelStudio()
INFO:root:compose config with overrides: ['+io/fetcher=labelstudio']
INFO:ekorpkit.config:Initalized batch: labelstudio(2) in /workspace/projects/ekorpkit-book/exmaples/esg/data
time: 756 ms (started: 2022-12-10 08:34:54 +00:00)
# Fetch the existing Label Studio projects; verbose=True also prints them.
project_list = ls.list_projects(verbose=True)
13: ESG Topic Classification (Sep 2022)
12: ESG Polarity Classification (Sep 2022)
3: ESG Topic Classification
2: ESG Polarity Classification
time: 1.1 s (started: 2022-12-10 08:34:55 +00:00)
# NOTE(review): the project id to delete is hard-coded; confirm it against
# the project list before running this cell.
ls.delete_project(11)

# Labeling interface: one single-choice question with three polarity options.
label_config = """
<View>
<Header value="Choose text sentiment:"/>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text" choice="single" showInline="true">
<Choice value="Positive"/>
<Choice value="Negative"/>
<Choice value="Neutral"/>
</Choices>
</View>
"""

polarity_project_options = dict(
    title="ESG Polarity Classification (Sep 2022)",
    label_config=label_config,
    color="#D55C9D",
)
project = ls.create_project(**polarity_project_options)

# Write a random 80% sample of the polarity predictions to a predictions file.
polarity_sample = polarity_data.sample(frac=0.8)
pred_file = ls.dataframe_to_predictions(
    polarity_sample,
    prediction_file="polarity_preds.json",
    choices_name="sentiment",
    model_version="1.0",
)
# ls.import_file(pred_file, project_id=12)
INFO:ekorpkit.io.fetch.labelstudio:/workspace/projects/ekorpkit-book/exmaples/esg/data/outputs/labelstudio/labelstudio(2)_polarity_preds.json is saved
time: 7.08 s (started: 2022-12-10 08:34:59 +00:00)
# Labeling interface for the ESG topic project: one single-choice question
# listing the topic classes. The header text shown to annotators previously
# read "clas:"; it is corrected to "class:".
# The Choices element keeps the name "sentiment" because the predictions
# export below passes choices_name="sentiment".
label_config = """
<View>
<Header value="Choose ESG topic class:"/>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text" choice="single" showInline="true">
<Choice value="E-기후변화"/>
<Choice value="E-환경영향"/>
<Choice value="E-환경혁신"/>
<Choice value="S-재해/안전관리"/>
<Choice value="S-고용"/>
<Choice value="S-기업(공급망)동반성장/상생"/>
<Choice value="S-사회공헌"/>
<Choice value="S-소비자"/>
<Choice value="G-지배구조"/>
<Choice value="G-기업윤리/불공정/소송"/>
<Choice value="UNKNOWN"/>
</Choices>
</View>
"""
project = ls.create_project(
    title="ESG Topic Classification (Sep 2022)",
    label_config=label_config,
    color="#51AAFD",
)
print(project["id"], project["title"])

# Write a random 80% sample of the topic predictions to a predictions file.
pred_file = ls.dataframe_to_predictions(
    category_data.sample(frac=0.8),
    prediction_file="category_preds.json",
    choices_name="sentiment",
    model_version="1.0",
)
# ls.import_file(pred_file, project_id=project["id"])
INFO:ekorpkit.io.fetch.labelstudio:/workspace/projects/ekorpkit-book/exmaples/esg/data/outputs/labelstudio/labelstudio(2)_category_preds.json is saved
time: 7.16 s (started: 2022-12-10 08:35:06 +00:00)
# Same polarity labeling interface as before: one single-choice question
# with three options.
label_config = """
<View>
<Header value="Choose text sentiment:"/>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text" choice="single" showInline="true">
<Choice value="Positive"/>
<Choice value="Negative"/>
<Choice value="Neutral"/>
</Choices>
</View>
"""

# Create the polarity project without an explicit color.
# NOTE(review): this reuses an existing project title; confirm a second
# project with the same title is intended.
project = ls.create_project(
    title="ESG Polarity Classification (Sep 2022)",
    label_config=label_config,
)