Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AIHub and Modu Corpus (Especially local installed corpus) into lmdata task #194

Merged
merged 2 commits into from
Jan 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 57 additions & 7 deletions Korpora/korpus_aihub_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,46 +57,98 @@
class AIHubTranslationKorpus(Korpus):
def __init__(self, root_dir=None, force_download=False, prefix='', name='AIHub_translation'):
    """Load the AIHub translation files selected by ``prefix`` into ``self.train``.

    Args:
        root_dir: Forwarded untouched to ``get_corpus_path``, which substitutes
            the default Korpora directory for ``None`` and appends
            ``AIHub_Translation/<prefix>`` when given an existing directory.
        force_download: Not read by this constructor.
        prefix: Path component (or pattern) that selects one sub-corpus.
        name: Base name of the dataset; ``.train`` is appended for the split
            name and it is also passed to ``load_aihub_translation``.

    Raises:
        ValueError: If no corpus file is found.
    """
    super().__init__(description, license)
    # Path resolution lives in get_corpus_path only. Resolving here as well
    # would join 'AIHub_Translation/<prefix>' onto root_dir a second time.
    paths = AIHubTranslationKorpus.get_corpus_path(root_dir, prefix, find_corpus_paths)
    if not paths:
        raise ValueError('Not found corpus files. Check `root_dir`')

    self.train = SentencePairKorpusData(
        f'{name}.train',
        *load_aihub_translation(paths, name)
    )

@classmethod
def get_corpus_path(cls, root_dir=None, prefix='', finder=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_corpus_path 라는 classmethod 를 두는 것은 정말 좋은 방법이라고 생각합니다. AIHub translation corpus 외에도 압축파일로 다운로드를 하는 파일들의 경우, 압축 해제된 데이터 파일들의 path 를 관리하기가 어려웠는데, 이 classmethod 는 다른 클래스에서도 지원되면 좋을 기능 같아서 따로 #197 이슈로 넣어뒀습니다.

이 작업은 수정 내용이 많을 듯 하니 이 PR 이후에 다른 PR 로 작업하면 좋을 듯 합니다.

if root_dir is None:
root_dir = os.path.join(default_korpora_path, 'AIHub_Translation', prefix)
elif isinstance(root_dir, str) and os.path.isdir(root_dir):
root_dir = os.path.join(root_dir, 'AIHub_Translation', prefix)
paths = []
if callable(finder):
paths = finder(root_dir)

return paths

@classmethod
def exists(cls, root_dir=None):
    """Return True only if every one of the six AIHub sub-corpora reports ``exists``.

    ``root_dir`` is forwarded unchanged to each sub-corpus class.
    """
    sub_korpora = (
        AIHubSpokenTranslationKorpus,
        AIHubConversationTranslationKorpus,
        AIHubNewsTranslationKorpus,
        AIHubKoreanCultureTranslationKorpus,
        AIHubDecreeTranslationKorpus,
        AIHubGovernmentWebsiteTranslationKorpus,
    )
    # Build a full list (not a generator) so every sub-corpus is probed
    # even after one of them has already reported False.
    return all([korpus.exists(root_dir=root_dir) for korpus in sub_korpora])


class AIHubSpokenTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``1_spoken*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='1_spoken*',
            name='AIHub_spoken_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='1_spoken*', finder=find_corpus_paths
        )
        return len(found) > 0


class AIHubConversationTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``2_conversation*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='2_conversation*',
            name='AIHub_conversation_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='2_conversation*', finder=find_corpus_paths
        )
        return len(found) > 0


class AIHubNewsTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``3_news*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='3_news*',
            name='AIHub_news_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='3_news*', finder=find_corpus_paths
        )
        return len(found) > 0


class AIHubKoreanCultureTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``4_korean_culture*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='4_korean_culture*',
            name='AIHub_korean_culture_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='4_korean_culture*', finder=find_corpus_paths
        )
        return len(found) > 0


class AIHubDecreeTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``5_decree*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='5_decree*',
            name='AIHub_decree_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='5_decree*', finder=find_corpus_paths
        )
        return len(found) > 0


class AIHubGovernmentWebsiteTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation sub-corpus selected by the ``6_government_website*`` prefix pattern."""

    def __init__(self, root_dir=None, force_download=False):
        """Initialise the parent with this sub-corpus' prefix and dataset name."""
        super().__init__(
            root_dir=root_dir,
            force_download=force_download,
            prefix='6_government_website*',
            name='AIHub_government_website_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one corpus path is found for this prefix."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir, prefix='6_government_website*', finder=find_corpus_paths
        )
        return len(found) > 0


def find_corpus_paths(root_dir, suffix='200226.xlsx'):
def match(path):
Expand All @@ -109,8 +161,6 @@ def match(path):
paths = root_dir

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')
return paths


Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_chatbot_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import csv

from .korpora import Korpus, LabeledSentencePairKorpusData
from .utils import fetch, default_korpora_path
from .utils import fetch, default_korpora_path, check_exists


KOREAN_CHATBOT_FETCH_INFORMATION = [
Expand Down Expand Up @@ -64,6 +64,10 @@ def get_all_pairs(self):
def get_all_labels(self):
    """Return whatever ``self.train.get_all_labels()`` yields."""
    train_split = self.train
    return train_split.get_all_labels()

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``korean_chatbot_data`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'korean_chatbot_data',
        KOREAN_CHATBOT_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_chatbot(root_dir, force_download):
for information in KOREAN_CHATBOT_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kcbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import platform

from .korpora import Korpus, KorpusData
from .utils import fetch, load_text, default_korpora_path
from .utils import fetch, load_text, default_korpora_path, check_exists


KCBERT_FETCH_INFORMATION = [
Expand Down Expand Up @@ -67,6 +67,10 @@ def __init__(self, root_dir=None, force_download=False):
dirname = os.path.abspath(f'{root_dir}/kcbert')
self.train = f'KcBERT corpus is downloaded. Open local directory {dirname}'

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``kcbert`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'kcbert',
        KCBERT_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_kcbert(root_dir, force_download):
for info in KCBERT_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_hate_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import List

from .korpora import Korpus, KorpusData, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KOREAN_HATE_SPEECH_FETCH_INFORMATION = [
Expand Down Expand Up @@ -153,6 +153,10 @@ def __init__(self, root_dir=None, force_download=False):
self.unlabeled = load_unlabeled(root_dir)
self.test = load_test(root_dir)

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``korean_hate_speech`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'korean_hate_speech',
        KOREAN_HATE_SPEECH_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def load_train(root_dir):
# head : comments, contain_gender_bias, bias, hate
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List

from .korpora import Korpus, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_parallel_text
from .utils import fetch, default_korpora_path, load_parallel_text, check_exists


KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -58,6 +58,10 @@ def fetch_and_load(self, mode, root_dir, fetch_info, force_download):
sources, targets = load_parallel_text(source_path, target_path)
return SentencePairKorpusData(dataname, sources, targets)

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``korean_parallel_koen_news`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'korean_parallel_koen_news',
        KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_korean_parallel_koen_news(root_dir, force_download):
for info in KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_petitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List

from .korpora import Korpus, KorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KOREAN_PETITIONS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -230,6 +230,10 @@ def get_all_num_agrees(self):
def get_all_titles(self):
    """Return the ``titles`` attribute of the ``train`` split."""
    train_split = self.train
    return train_split.titles

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``korean_petitions`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'korean_petitions',
        KOREAN_PETITIONS_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_korean_petitions(root_dir, force_download):
for info in KOREAN_PETITIONS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kornli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List

from .korpora import Korpus, LabeledSentencePairKorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KORNLI_FETCH_INFORMATION = [
Expand Down Expand Up @@ -102,6 +102,10 @@ def get_all_labels(self):
self.xnli_dev.get_all_labels() +
self.xnli_test.get_all_labels())

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``kornli`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'kornli',
        KORNLI_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_kornli(root_dir, force_download):
for info in KORNLI_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korsts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dataclasses import dataclass

from .korpora import Korpus, LabeledSentencePairKorpusData, LabeledSentencePair
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KORSTS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -130,6 +130,10 @@ def get_all_filenames(self):
def get_all_years(self):
    """Return the years of the train, dev and test splits concatenated in that order."""
    years = self.train.get_all_years()
    # Use `+` rather than `+=` so the list owned by the train split is never mutated.
    years = years + self.dev.get_all_years()
    years = years + self.test.get_all_years()
    return years

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``korsts`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'korsts',
        KORSTS_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_korsts(root_dir, force_download):
for info in KORSTS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kowiki.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from .korpora import Korpus, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_wikitext
from .utils import fetch, default_korpora_path, load_wikitext, check_exists


KOWIKI_FETCH_INFORMATION = [
Expand Down Expand Up @@ -82,6 +82,10 @@ def split_title_text(wikitext):
# swap position
return texts, titles

@classmethod
def exists(cls, root_dir=None):
    """Delegate to ``check_exists`` for the ``kowikitext`` corpus.

    ``root_dir`` is forwarded unchanged. Presumably the helper reports
    whether the files listed in the fetch information are on disk -
    confirm against ``utils.check_exists``.
    """
    is_present = check_exists(
        'kowikitext',
        KOWIKI_FETCH_INFORMATION,
        root_dir=root_dir,
    )
    return is_present


def fetch_kowikitext(root_dir, force_download):
for information in KOWIKI_FETCH_INFORMATION:
Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_messenger.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@
class ModuMessengerKorpus(ModuKorpus):
    """Korpus built from locally installed ``NIKL_MESSENGER`` corpus files."""

    def __init__(self, root_dir=None, force_download=False):
        """Locate the messenger corpus files and load them into ``self.train``.

        Args:
            root_dir: Forwarded untouched to ``ModuKorpus.get_corpus_path``
                together with the ``NIKL_MESSENGER`` corpus name, exactly as
                ``exists`` does.
            force_download: Not read by this constructor.

        Raises:
            ValueError: If no corpus file is found.
        """
        super().__init__()
        # Directory resolution is left entirely to get_corpus_path so that the
        # constructor and `exists` search the same location. Pre-resolving
        # root_dir here and calling find_corpus_paths a second time would only
        # duplicate (and possibly double-apply) that logic.
        paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths)
        if not paths:
            raise ValueError('Not found corpus files. Check `root_dir`')

        self.train = KorpusData('모두의_메신저_말뭉치(conversation).train', load_modu_messenger(paths))

    @classmethod
    def exists(cls, root_dir=None):
        """Return True when at least one messenger corpus path is found."""
        paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths)
        return len(paths) > 0


@dataclass
class Utterance:
Expand Down Expand Up @@ -59,8 +62,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_morpheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
class ModuMorphemeKorpus(ModuKorpus):
def __init__(self, root_dir=None, force_download=False):
super().__init__()
if root_dir is None:
root_dir = os.path.join(default_korpora_path, 'NIKL_MP')
alternative_root_dir = os.path.join(root_dir, 'NIKL_MP')
if os.path.exists(alternative_root_dir):
root_dir = alternative_root_dir
paths = find_corpus_paths(root_dir)
paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths)
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')

self.train = KorpusData('모두의_형태분석_말뭉치.train', load_modu_morpheme(paths))
self.tagmap = {
'JKS': '주격조사',
Expand Down Expand Up @@ -72,6 +70,11 @@ def __init__(self, root_dir=None, force_download=False):
'NAP': '이름과 같은 개인정보'
}

@classmethod
def exists(cls, root_dir=None):
    """Return True when at least one ``NIKL_MP`` corpus path is found.

    ``root_dir`` is forwarded unchanged to ``ModuKorpus.get_corpus_path``.
    """
    found = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths)
    return len(found) > 0


def find_corpus_paths(root_dir_or_paths):
prefix_pattern = re.compile('[NS]XMP')
Expand All @@ -86,8 +89,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_ne.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
class ModuNEKorpus(ModuKorpus):
def __init__(self, root_dir=None, force_download=False):
super().__init__()
if root_dir is None:
root_dir = os.path.join(default_korpora_path, 'NIKL_NE')
alternative_root_dir = os.path.join(root_dir, 'NIKL_NE')
if os.path.exists(alternative_root_dir):
root_dir = alternative_root_dir
paths = find_corpus_paths(root_dir)
paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths)
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')

self.train = KorpusData('모두의_개체명_말뭉치.train', load_modu_ne(paths))
self.tagmap = {
'PS': 'PERSON',
Expand All @@ -39,6 +37,11 @@ def __init__(self, root_dir=None, force_download=False):
'TM': 'TERM'
}

@classmethod
def exists(cls, root_dir=None):
    """Return True when at least one ``NIKL_NE`` corpus path is found.

    ``root_dir`` is forwarded unchanged to ``ModuKorpus.get_corpus_path``.
    """
    found = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths)
    return len(found) > 0


def find_corpus_paths(root_dir_or_paths):
prefix_pattern = re.compile('[NS]XNE')
Expand All @@ -53,8 +56,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
Loading