Improving classification datasets#

This section walks through cleaning up the esg_topics classification dataset: remapping overly fine-grained categories, running 5-fold cross-validation with a transformer classifier to obtain out-of-fold predictions, using Rubrix to surface and correct potential label errors, and saving the improved dataset.
from ekorpkit import eKonf
eKonf.setLogger("INFO")
print("version:", eKonf.__version__)
is_colab = eKonf.is_colab()
print("is colab?", is_colab)
if is_colab:
    eKonf.mount_google_drive()
workspace_dir = "/workspace"
project_name = "ekorpkit-book/examples/esg"
ws = eKonf.set_workspace(workspace=workspace_dir, project=project_name)
print("project_dir:", ws.project_dir)
ws.envs.dict()
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Set environment variable EKORPKIT_PROJECT=ekorpkit-book/examples/esg
INFO:ekorpkit.base:Set environment variable EKORPKIT_PROJECT_DIR=/workspace/projects/ekorpkit-book/examples/esg
version: 0.1.40.post0.dev50
is colab? False
INFO:root:compose config with overrides: ['project=default']
INFO:ekorpkit.base:There are no arguments to initilize a config, using default config.
project_dir: /workspace/projects/ekorpkit-book/examples/esg
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
'EKORPKIT_WORKSPACE_ROOT': '/workspace',
'EKORPKIT_PROJECT': 'ekorpkit-book/examples/esg',
'EKORPKIT_PROJECT_DIR': '/workspace/projects/ekorpkit-book/examples/esg',
'EKORPKIT_DATA_DIR': None,
'EKORPKIT_LOG_LEVEL': 'INFO',
'NUM_WORKERS': 230,
'KMP_DUPLICATE_LIB_OK': 'TRUE',
'CUDA_DEVICE_ORDER': None,
'CUDA_VISIBLE_DEVICES': None,
'WANDB_PROJECT': None,
'WANDB_DISABLED': None,
'LABEL_STUDIO_SERVER': 'http://ekorpkit-labelstudio:8080',
'CACHED_PATH_CACHE_ROOT': None}
time: 1.1 s (started: 2022-12-12 10:24:03 +00:00)
Preparing the esg_topics dataset#
ds_cfg = eKonf.compose('dataset')
ds_cfg.name = 'esg_topics'
ds_cfg.data_dir = '/workspace/data/datasets/simple'
eKonf.print(ds_cfg)
# instantiate the dataset so its splits can be remapped below
ds = eKonf.instantiate(ds_cfg)
# labels = list(ds.splits['train'].labels.unique())
# print(labels)
INFO:root:compose config with overrides: ['dataset=default']
{'_target_': 'ekorpkit.datasets.dataset.Dataset',
'auto': {'build': False, 'load': True},
'column_info': {'_target_': 'ekorpkit.info.column.DatasetInfo',
'columns': {'id': 'id', 'text': 'text'},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None}},
'data_dir': '/workspace/data/datasets/simple',
'filetype': '.parquet',
'force': {'build': False},
'info': {'_target_': 'ekorpkit.info.stat.SummaryInfo',
'aggregate_info': {'num_examples': 'num_examples',
'size_in_bytes': 'num_bytes'},
'data_dir': '/workspace/data/datasets/simple',
'info_file': 'info-esg_topics.yaml',
'info_list': ['name',
'fullname',
'domain',
'task',
'lang',
'description',
'license',
'homepage',
'version',
'num_examples',
'size_in_bytes',
'size_in_human_bytes',
'data_files_modified',
'info_updated',
'data_files',
'column_info'],
'key_columns': None,
'modified_info': {'data_files_modified': 'data_file'},
'name': 'esg_topics',
'stats': {'_func_': {'len_bytes': {'_partial_': True,
'_target_': 'ekorpkit.utils.func.len_bytes'}},
'_partial_': True,
'_target_': 'ekorpkit.info.stat.summary_stats',
'agg_funcs': {'num_bytes': ['count',
'sum',
'median',
'max',
'min']},
'convert_to_humanbytes': {'num_bytes': 'human_bytes'},
'key_columns': None,
'num_columns': {'num_bytes': 'len_bytes'},
'num_workers': 1,
'rename_columns': {'num_bytes_count': 'num_examples',
'num_bytes_sum': 'num_bytes'},
'text_keys': 'text'},
'update_files_info': {'data_files': 'data_file',
'meta_files': 'meta_file'},
'update_info': ['fullname',
'lang',
'domain',
'task',
'description',
'license',
'homepage',
'version'],
'verbose': False},
'name': 'esg_topics',
'path': {'cache': {'cache_dir': '/root/.ekorpkit/.cache',
'extract_archive': True,
'force_extract': False,
'path': None,
'return_parent_dir': True,
'uri': None,
'verbose': False},
'cached_path': None,
'columns': None,
'concat_data': False,
'data_columns': None,
'data_dir': '/workspace/data/datasets/simple',
'data_file': None,
'filetype': '.parquet',
'name': 'esg_topics',
'output_dir': '/root/.ekorpkit/projects/ekorpkit-book/examples/esg/esg_topics/outputs',
'output_file': None,
'root': '/root/.ekorpkit/projects/ekorpkit-book/examples/esg/esg_topics',
'suffix': None,
'verbose': False},
'use_name_as_subdir': True,
'verbose': False}
time: 599 ms (started: 2022-12-12 10:27:28 +00:00)
Merge several of the fine-grained topics into broader categories, and map two categories to 'NA' so that they can be dropped:

remap_cat = {
    'E-신재생에너지 발전': 'E-환경혁신',
    'E-원자력발전': 'E-환경혁신',
    'S-기술혁신': 'S-소비자',
    'S-노조/노사': 'S-고용',
    'S-인적자본': 'S-고용',
    'S-산업재해/안전관리': 'S-재해/안전관리',
    'G-정보공시': 'NA',
    'G-주주환원': 'NA',
}
remap_cat
{'E-신재생에너지 발전': 'E-환경혁신',
'E-원자력발전': 'E-환경혁신',
'S-기술혁신': 'S-소비자',
'S-노조/노사': 'S-고용',
'S-인적자본': 'S-고용',
'S-산업재해/안전관리': 'S-재해/안전관리',
'G-정보공시': 'NA',
'G-주주환원': 'NA'}
for split, data in ds._splits.items():
    # merge categories and drop the rows that were mapped to 'NA'
    data['labels'] = data['labels'].map(remap_cat).fillna(data['labels'])
    data = data[data.labels != 'NA']
    ds._splits[split] = data

ds.save_as("esg_topics_remapped")
INFO:ekorpkit.base:Using batcher with minibatch size: 39
INFO:ekorpkit.base:Using batcher with minibatch size: 5
INFO:ekorpkit.base:Using batcher with minibatch size: 5
ds_cfg.name = "esg_topics_remapped"
ds = eKonf.instantiate(ds_cfg)
labels = list(ds.splits['train'].labels.unique())
print(labels)
['S-기업(공급망)동반성장/상생', 'G-지배구조', 'G-기업윤리/불공정/소송', 'S-소비자', 'E-환경혁신', 'S-사회공헌', 'S-고용', 'E-환경영향', 'E-기후변화', 'S-재해/안전관리']
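With the categories merged and the 'NA' rows dropped, it is worth confirming that every split still contains all ten remaining labels and that no class has become too small. A minimal check, assuming `ds.splits` behaves like a dict of pandas DataFrames with a `labels` column (as the cells above suggest):

# Sketch: per-split label distribution of the remapped dataset.
for split, data in ds.splits.items():
    print(f"--- {split}: {len(data)} examples ---")
    print(data["labels"].value_counts())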
Cross-validation of the esg_topics dataset#
overrides = [
    '+model/transformer=classification',
    '+model/transformer/pretrained=ekonelectra-base',
]
model_cfg = eKonf.compose('model/transformer=classification', overrides)
model_cfg.name = "esg_topics"
model_cfg.dataset = ds_cfg
model_cfg.verbose = False
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
model_cfg.config.train_batch_size = 32
model_cfg.config.eval_batch_size = 32
model_cfg._method_ = []
# model_cfg.model.eval.visualize.plot.confusion_matrix.include_values = False
# model_cfg.model.eval.visualize.plot.confusion_matrix.include_percentages = False
# model_cfg.model.eval.visualize.plot.figure.figsize = (12,10)
model = eKonf.instantiate(model_cfg)
INFO:ekorpkit.base:No method defined to call
# `data_dir` is not defined earlier in this notebook; any writable directory
# will do. Here we assume the project directory.
data_dir = ws.project_dir
cv_preds = model.cross_val_predict(cv=5)
eKonf.save_data(cv_preds, "esg_topics_cv_preds.parquet", data_dir)
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
wandb: Currently logged in as: entelecheia. Use `wandb login --relogin` to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071028-3bx0elnp
Finishing last run (ID:3bx0elnp) before initializing another...
Waiting for W&B process to finish... (success).
Run history:
Training loss | █▇▆▅▃▂▁▁▄ |
acc | ▁█ |
eval_loss | █▁ |
global_step | ▁▂▃▄▄▄▅▆▇██ |
lr | █▇▆▅▄▄▃▂▁ |
mcc | ▁█ |
train_loss | █▁ |
Run summary:
Training loss | 1.03131 |
acc | 0.76427 |
eval_loss | 0.80164 |
global_step | 456 |
lr | 0.0 |
mcc | 0.72096 |
train_loss | 0.71639 |
Synced scarlet-wind-89: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/3bx0elnp
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071028-3bx0elnp/logs
Successfully finished last run (ID:3bx0elnp). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071142-1r3nk1yw
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:1r3nk1yw) before initializing another...
Waiting for W&B process to finish... (success).
Synced fiery-plasma-90: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/1r3nk1yw
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071142-1r3nk1yw/logs
Successfully finished last run (ID:1r3nk1yw). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071214-kpbs288v
Finishing last run (ID:kpbs288v) before initializing another...
Waiting for W&B process to finish... (success).
Run history:
Training loss | █▆▄▅▄▂▂▄▁ |
acc | ▁█ |
eval_loss | █▁ |
global_step | ▁▂▃▄▄▄▅▆▇██ |
lr | █▇▆▅▄▄▃▂▁ |
mcc | ▁█ |
train_loss | ▁█ |
Run summary:
Training loss | 0.47778 |
acc | 0.77445 |
eval_loss | 0.7631 |
global_step | 456 |
lr | 0.0 |
mcc | 0.73189 |
train_loss | 0.85427 |
Synced sweet-wave-91: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/kpbs288v
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071214-kpbs288v/logs
Successfully finished last run (ID:kpbs288v). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071333-w3krkvu5
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:w3krkvu5) before initializing another...
Waiting for W&B process to finish... (success).
Synced icy-serenity-92: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/w3krkvu5
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071333-w3krkvu5/logs
Successfully finished last run (ID:w3krkvu5). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071403-2vv0hewt
Finishing last run (ID:2vv0hewt) before initializing another...
Waiting for W&B process to finish... (success).
Run history:
Training loss | █▅▅▂▃▂▃▁▁ |
acc | ▁█ |
eval_loss | █▁ |
global_step | ▁▂▃▄▄▄▅▆▇██ |
lr | █▇▆▅▄▄▃▂▁ |
mcc | ▁█ |
train_loss | ▁█ |
Run summary:
Training loss | 0.67537 |
acc | 0.76936 |
eval_loss | 0.79864 |
global_step | 456 |
lr | 0.0 |
mcc | 0.72731 |
train_loss | 0.44434 |
Synced northern-dream-93: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/2vv0hewt
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071403-2vv0hewt/logs
Successfully finished last run (ID:2vv0hewt). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071523-35lxag5u
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:35lxag5u) before initializing another...
Waiting for W&B process to finish... (success).
Synced iconic-sea-94: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/35lxag5u
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071523-35lxag5u/logs
Successfully finished last run (ID:35lxag5u). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071554-2e7jflgg
Finishing last run (ID:2e7jflgg) before initializing another...
Waiting for W&B process to finish... (success).
Run history:
Training loss | █▆▆▃▃▃▂▁▃ |
acc | ▁█ |
eval_loss | █▁ |
global_step | ▁▂▃▄▄▄▅▆▇██ |
lr | █▇▆▅▄▄▃▂▁ |
mcc | ▁█ |
train_loss | █▁ |
Run summary:
Training loss | 1.06689 |
acc | 0.77671 |
eval_loss | 0.78679 |
global_step | 456 |
lr | 0.0 |
mcc | 0.73375 |
train_loss | 0.55624 |
Synced winter-haze-95: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/2e7jflgg
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071554-2e7jflgg/logs
Successfully finished last run (ID:2e7jflgg). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071712-a7mqtfdv
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:a7mqtfdv) before initializing another...
Waiting for W&B process to finish... (success).
Synced trim-shadow-96: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/a7mqtfdv
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071712-a7mqtfdv/logs
Successfully finished last run (ID:a7mqtfdv). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071742-1sxdb0fs
Finishing last run (ID:1sxdb0fs) before initializing another...
Waiting for W&B process to finish... (success).
Run history:
Training loss | █▅▆▅▂▁▃▁▂ |
acc | ▁█ |
eval_loss | █▁ |
global_step | ▁▂▃▄▄▄▅▆▇██ |
lr | █▇▆▅▄▄▃▂▁ |
mcc | ▁█ |
train_loss | ▁█ |
Run summary:
Training loss | 0.89486 |
acc | 0.7671 |
eval_loss | 0.7825 |
global_step | 456 |
lr | 0.0 |
mcc | 0.72317 |
train_loss | 0.5121 |
Synced super-plasma-97: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/1sxdb0fs
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at:
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071742-1sxdb0fs/logs
Successfully finished last run (ID:1sxdb0fs). Initializing new run:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tracking run with wandb version 0.13.2
Run data is saved locally in
/workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071900-2obg8t98
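`cross_val_predict(cv=5)` trains the classifier five times, each time holding out one fold, so every training example receives a probability vector from a model that never saw it during training. These out-of-fold probabilities are exactly what the label-error search in the next step needs. The same idea in a generic, library-agnostic form (a scikit-learn sketch for illustration only, not ekorpkit's internal implementation):

# Conceptual sketch of out-of-fold prediction with scikit-learn (illustration only).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline

texts = [
    "solar plant expansion announced",
    "carbon emissions reduced this year",
    "board approved a new governance charter",
    "audit committee independence strengthened",
    "employee safety training expanded",
    "factory accident prevention measures",
]
labels = ["E", "E", "G", "G", "S", "S"]

clf = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
# Each row of oof_probs comes from the fold in which that example was held out.
oof_probs = cross_val_predict(clf, texts, labels, cv=2, method="predict_proba")
print(oof_probs.shape)  # (n_examples, n_classes)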
Use Rubrix to find potential label errors#
rb_cfg = eKonf.compose('model/rubrix')
rb_cfg.auto.init = True
rb = eKonf.instantiate(rb_cfg)
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
rb.get_workspace()
'esgml'
cv_preds = eKonf.load_data("esg_topics_cv_preds.parquet", data_dir)
records = rb.create_records_from_cv_preds(
cv_preds,
)
records[0]
TextClassificationRecord(text='os 전쟁 구글 , 애플 , ms , 인텔 삼성전자 등 각각 의 iot 전용 os 강화\n업체 들 안드로이드 os 견제 해 아직 채택 안함 센서 고성장 전망 mems ( 반도체 센서 ) 시장 고성장 예상\n다만 , 국내 기술 은 매우 미흡\n인공지능 이 인공지능 이 3 세대 세대 deepdeep learninglearning 의 초기 초기 국면 으로 국면 으로 들어가면서들어가면서 혁신 이 혁신 이 일어나기일어나기 시작 인공지능 연구 가 3 세대 deep learning 의 초기 국면 으로 들어가면서 혁신 이 일어나기 시작 함', inputs={'text': 'os 전쟁 구글 , 애플 , ms , 인텔 삼성전자 등 각각 의 iot 전용 os 강화\n업체 들 안드로이드 os 견제 해 아직 채택 안함 센서 고성장 전망 mems ( 반도체 센서 ) 시장 고성장 예상\n다만 , 국내 기술 은 매우 미흡\n인공지능 이 인공지능 이 3 세대 세대 deepdeep learninglearning 의 초기 초기 국면 으로 국면 으로 들어가면서들어가면서 혁신 이 혁신 이 일어나기일어나기 시작 인공지능 연구 가 3 세대 deep learning 의 초기 국면 으로 들어가면서 혁신 이 일어나기 시작 함'}, prediction=[('E-기후변화', 0.0031880621893694222), ('E-환경영향', 0.005058959626738143), ('E-환경혁신', 0.010744794691714028), ('G-기업윤리/불공정/소송', 0.01250313787700796), ('G-지배구조', 0.018917387394747708), ('S-고용', 0.00931412617366345), ('S-기업(공급망)동반성장/상생', 0.0068541975270435635), ('S-사회공헌', 0.0037565036432219917), ('S-소비자', 0.9259529941013552), ('S-재해/안전관리', 0.003709836775138507)], prediction_agent=None, annotation='S-소비자', annotation_agent=None, multi_label=False, explanation=None, id=None, metadata={'id': 411, 'split': 'train'}, status='Validated', event_timestamp=None, metrics=None, search_keywords=None)
# get records with potential label errors
records_with_label_error = rb.find_label_errors(records)
records_with_label_error[0]
TextClassificationRecord(text='최근 이동통신 시장 은 lte 서비스 도입 으로 큰 변화 를 맞고 있음\nlte 는 3 g 이동통신망 으로 널리 채택 된 wcdma 방식 에서 발전 된 규격 으로 전세계 이동통신사 의 80 % 이상 이 lte 를 채택 하고 있음\n와이브로 는 한국 이 lte 와 경쟁 하고자 만든 차세대 국책사업 중 하나였지만 , 초기 시장 창출 실패 와 이동통신사업자 들 의 견제 , 정부 의 정책 부재 등으로 사업화 에 난항\n국내 lte 서비스 는 상용화 1 년 만에 1 , 000 만명을 넘어선 반면 , 와이브로 는 6 년이 지난 현재 까지도 사용자 가 100 만명 수준 에 불과한 실정\n이러한 통신시장 의 구조적 인 변화 는 와이브로 에 편중 된 동사 에 위기 로 작용 할 수 있음', inputs={'text': '최근 이동통신 시장 은 lte 서비스 도입 으로 큰 변화 를 맞고 있음\nlte 는 3 g 이동통신망 으로 널리 채택 된 wcdma 방식 에서 발전 된 규격 으로 전세계 이동통신사 의 80 % 이상 이 lte 를 채택 하고 있음\n와이브로 는 한국 이 lte 와 경쟁 하고자 만든 차세대 국책사업 중 하나였지만 , 초기 시장 창출 실패 와 이동통신사업자 들 의 견제 , 정부 의 정책 부재 등으로 사업화 에 난항\n국내 lte 서비스 는 상용화 1 년 만에 1 , 000 만명을 넘어선 반면 , 와이브로 는 6 년이 지난 현재 까지도 사용자 가 100 만명 수준 에 불과한 실정\n이러한 통신시장 의 구조적 인 변화 는 와이브로 에 편중 된 동사 에 위기 로 작용 할 수 있음'}, prediction=[('E-기후변화', 0.0030561790387218834), ('E-환경영향', 0.004940841001211258), ('E-환경혁신', 0.011977991797623686), ('G-기업윤리/불공정/소송', 0.00884969377797024), ('G-지배구조', 0.011248100661326789), ('S-고용', 0.006743854005870231), ('S-기업(공급망)동반성장/상생', 0.006392374888576145), ('S-사회공헌', 0.00277139162106223), ('S-소비자', 0.9404210229503303), ('S-재해/안전관리', 0.0035985502573076407)], prediction_agent=None, annotation='E-환경영향', annotation_agent=None, multi_label=False, explanation=None, id=None, metadata={'id': 851, 'split': 'dev', 'label_error_candidate': 0}, status='Validated', event_timestamp=None, metrics=None, search_keywords=None)
len(records_with_label_error)
1408
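The 1,408 flagged records are those whose human annotation looks suspicious given the cross-validated predictions. The criterion used by `find_label_errors` is more refined than a single threshold, but the intuition can be approximated with a simple rule: flag a record when the model is confident about a label that differs from the annotation. A hedged sketch over the record structure shown above (`prediction` as a list of `(label, probability)` pairs, `annotation` as a string):

# Naive approximation (illustration only, not the algorithm used by rb.find_label_errors):
# flag records whose top out-of-fold prediction is confident yet disagrees with the annotation.
def naive_label_error_candidates(records, min_confidence=0.9):
    candidates = []
    for rec in records:
        top_label, top_prob = max(rec.prediction, key=lambda lp: lp[1])
        if top_prob >= min_confidence and top_label != rec.annotation:
            candidates.append(rec)
    return candidates

# e.g. len(naive_label_error_candidates(records))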
# uncover label errors in the Rubrix web app
rb.log(records_with_label_error, "esg_topic_label_errors")
1408 records logged to http://ekorpkit-book:6900/datasets/esgml/esg_topic_label_errors
Saving the re-labelled dataset#
relabelled_dataset = rb.load("esg_topic_label_errors")
for split, data in ds._splits.items():
    data = rb.update_label_errors(data, relabelled_dataset, split=split)
    ds._splits[split] = data
len(ds.data[ds.data.labels != ds.data.original_labels])
27
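Only 27 of the roughly 1,400 candidates were actually re-labelled in the Rubrix UI. Before saving, it can help to eyeball exactly which rows changed; a small sketch, assuming the combined data keeps both an `original_labels` and a `labels` column as the comparison above implies:

# Sketch: show the rows whose label was changed during re-labelling.
changed = ds.data[ds.data.labels != ds.data.original_labels]
print(changed[["text", "original_labels", "labels"]].head(10))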
ds.save_as("esg_topics_improved")
ds_cfg.name = "esg_topics_improved"
ds = eKonf.instantiate(ds_cfg)
labels = list(ds.splits['train'].labels.unique())
print(labels)
INFO:ekorpkit.base:Using batcher with minibatch size: 39
INFO:ekorpkit.base:Using batcher with minibatch size: 5
INFO:ekorpkit.base:Using batcher with minibatch size: 5
['S-기업(공급망)동반성장/상생', 'G-지배구조', 'G-기업윤리/불공정/소송', 'S-소비자', 'E-환경혁신', 'S-사회공헌', 'S-고용', 'E-환경영향', 'E-기후변화', 'S-재해/안전관리']
ds.data_dir
'/workspace/data/datasets/simple/esg_topics_improved'
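A natural follow-up is to re-run the same 5-fold cross-validation on the cleaned dataset and compare accuracy and MCC against the numbers above. A sketch reusing the configuration objects already built in this notebook (the output file name is just a suggestion):

# Sketch: repeat the cross-validation on the improved dataset for comparison.
model_cfg.dataset = ds_cfg  # ds_cfg.name is now "esg_topics_improved"
model = eKonf.instantiate(model_cfg)
cv_preds_improved = model.cross_val_predict(cv=5)
eKonf.save_data(cv_preds_improved, "esg_topics_improved_cv_preds.parquet", data_dir)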