Skip to content

Commit

Permalink
Merge pull request #194 from hwiorn/add-more-corpus-lmdata-task
Browse files Browse the repository at this point in the history
Add AIHub and Modu corpora (especially locally installed corpora) to the lmdata task
  • Loading branch information
lovit authored Jan 27, 2021
2 parents 2571bb0 + f8784ea commit 2a2be9d
Show file tree
Hide file tree
Showing 34 changed files with 331 additions and 83 deletions.
64 changes: 57 additions & 7 deletions Korpora/korpus_aihub_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,46 +57,98 @@
class AIHubTranslationKorpus(Korpus):
    """Loader for AIHub translation corpus files found on the local disk.

    The corpus files are located with ``find_corpus_paths`` and loaded into
    ``self.train`` as a ``SentencePairKorpusData`` named ``f'{name}.train'``.

    Args:
        root_dir: ``None`` to search below ``default_korpora_path``; a string
            naming an existing directory to search below that directory; any
            other value is handed to the finder unchanged.
        force_download: Accepted for interface compatibility; this class does
            not read it.
        prefix: Path component appended after ``'AIHub_Translation'`` when the
            search location is derived (subclasses pass patterns such as
            ``'1_spoken*'``).
        name: Corpus name used for the train split and passed to
            ``load_aihub_translation``.

    Raises:
        ValueError: If no corpus file is found.
    """

    def __init__(self, root_dir=None, force_download=False, prefix='', name='AIHub_translation'):
        super().__init__(description, license)
        # Resolve the search location exactly once, inside get_corpus_path.
        # Resolving it here as well would append 'AIHub_Translation'/prefix a
        # second time whenever the first result is an existing directory.
        paths = AIHubTranslationKorpus.get_corpus_path(root_dir, prefix, find_corpus_paths)
        if not paths:
            raise ValueError('Not found corpus files. Check `root_dir`')

        self.train = SentencePairKorpusData(
            f'{name}.train',
            *load_aihub_translation(paths, name)
        )

    @classmethod
    def get_corpus_path(cls, root_dir=None, prefix='', finder=None):
        """Resolve the search location and return what ``finder`` finds there.

        Args:
            root_dir: ``None`` selects
                ``default_korpora_path/AIHub_Translation/prefix``; a string
                naming an existing directory selects
                ``root_dir/AIHub_Translation/prefix``; anything else is
                passed to ``finder`` unchanged.
            prefix: Path component appended after ``'AIHub_Translation'``.
            finder: Callable that receives the resolved location.

        Returns:
            The value returned by ``finder``, or an empty list when
            ``finder`` is not callable.
        """
        if root_dir is None:
            root_dir = os.path.join(default_korpora_path, 'AIHub_Translation', prefix)
        elif isinstance(root_dir, str) and os.path.isdir(root_dir):
            root_dir = os.path.join(root_dir, 'AIHub_Translation', prefix)

        if not callable(finder):
            return []
        return finder(root_dir)

    @classmethod
    def exists(cls, root_dir=None):
        """Return True only if every AIHub translation sub-corpus exists.

        The sub-corpora are checked in order and checking stops at the first
        one that is missing, so later locations are not scanned needlessly.
        """
        sub_korpora = (
            AIHubSpokenTranslationKorpus,
            AIHubConversationTranslationKorpus,
            AIHubNewsTranslationKorpus,
            AIHubKoreanCultureTranslationKorpus,
            AIHubDecreeTranslationKorpus,
            AIHubGovernmentWebsiteTranslationKorpus,
        )
        return all(korpus.exists(root_dir=root_dir) for korpus in sub_korpora)


class AIHubSpokenTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``1_spoken*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='1_spoken*',
            name='AIHub_spoken_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``1_spoken*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='1_spoken*', finder=find_corpus_paths)
        return len(found) > 0


class AIHubConversationTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``2_conversation*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='2_conversation*',
            name='AIHub_conversation_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``2_conversation*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='2_conversation*', finder=find_corpus_paths)
        return len(found) > 0


class AIHubNewsTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``3_news*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='3_news*',
            name='AIHub_news_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``3_news*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='3_news*', finder=find_corpus_paths)
        return len(found) > 0


class AIHubKoreanCultureTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``4_korean_culture*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='4_korean_culture*',
            name='AIHub_korean_culture_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``4_korean_culture*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='4_korean_culture*', finder=find_corpus_paths)
        return len(found) > 0


class AIHubDecreeTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``5_decree*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='5_decree*',
            name='AIHub_decree_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``5_decree*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='5_decree*', finder=find_corpus_paths)
        return len(found) > 0


class AIHubGovernmentWebsiteTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus limited to the ``6_government_website*`` files."""

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='6_government_website*',
            name='AIHub_government_website_translation',
        )

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one ``6_government_website*`` corpus file is found."""
        found = AIHubTranslationKorpus.get_corpus_path(
            root_dir=root_dir, prefix='6_government_website*', finder=find_corpus_paths)
        return len(found) > 0


def find_corpus_paths(root_dir, suffix='200226.xlsx'):
def match(path):
Expand All @@ -109,8 +161,6 @@ def match(path):
paths = root_dir

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')
return paths


Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_chatbot_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import csv

from .korpora import Korpus, LabeledSentencePairKorpusData
from .utils import fetch, default_korpora_path
from .utils import fetch, default_korpora_path, check_exists


KOREAN_CHATBOT_FETCH_INFORMATION = [
Expand Down Expand Up @@ -64,6 +64,10 @@ def get_all_pairs(self):
def get_all_labels(self):
    """Return the labels reported by the train split."""
    train_split = self.train
    return train_split.get_all_labels()

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the Korean chatbot corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'korean_chatbot_data'
    return check_exists(corpus_name, KOREAN_CHATBOT_FETCH_INFORMATION, root_dir=root_dir)


def fetch_chatbot(root_dir, force_download):
for information in KOREAN_CHATBOT_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kcbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import platform

from .korpora import Korpus, KorpusData
from .utils import fetch, load_text, default_korpora_path
from .utils import fetch, load_text, default_korpora_path, check_exists


KCBERT_FETCH_INFORMATION = [
Expand Down Expand Up @@ -67,6 +67,10 @@ def __init__(self, root_dir=None, force_download=False):
dirname = os.path.abspath(f'{root_dir}/kcbert')
self.train = f'KcBERT corpus is downloaded. Open local directory {dirname}'

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the KcBERT corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'kcbert'
    return check_exists(corpus_name, KCBERT_FETCH_INFORMATION, root_dir=root_dir)


def fetch_kcbert(root_dir, force_download):
for info in KCBERT_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_hate_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import List

from .korpora import Korpus, KorpusData, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KOREAN_HATE_SPEECH_FETCH_INFORMATION = [
Expand Down Expand Up @@ -153,6 +153,10 @@ def __init__(self, root_dir=None, force_download=False):
self.unlabeled = load_unlabeled(root_dir)
self.test = load_test(root_dir)

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the Korean hate speech corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'korean_hate_speech'
    return check_exists(corpus_name, KOREAN_HATE_SPEECH_FETCH_INFORMATION, root_dir=root_dir)


def load_train(root_dir):
# head : comments, contain_gender_bias, bias, hate
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List

from .korpora import Korpus, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_parallel_text
from .utils import fetch, default_korpora_path, load_parallel_text, check_exists


KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -58,6 +58,10 @@ def fetch_and_load(self, mode, root_dir, fetch_info, force_download):
sources, targets = load_parallel_text(source_path, target_path)
return SentencePairKorpusData(dataname, sources, targets)

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the Korean-English news parallel corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'korean_parallel_koen_news'
    return check_exists(corpus_name, KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION, root_dir=root_dir)


def fetch_korean_parallel_koen_news(root_dir, force_download):
for info in KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korean_petitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List

from .korpora import Korpus, KorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KOREAN_PETITIONS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -230,6 +230,10 @@ def get_all_num_agrees(self):
def get_all_titles(self):
    """Return the ``titles`` attribute of the train split."""
    train_split = self.train
    return train_split.titles

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the Korean petitions corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'korean_petitions'
    return check_exists(corpus_name, KOREAN_PETITIONS_FETCH_INFORMATION, root_dir=root_dir)


def fetch_korean_petitions(root_dir, force_download):
for info in KOREAN_PETITIONS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kornli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List

from .korpora import Korpus, LabeledSentencePairKorpusData
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KORNLI_FETCH_INFORMATION = [
Expand Down Expand Up @@ -102,6 +102,10 @@ def get_all_labels(self):
self.xnli_dev.get_all_labels() +
self.xnli_test.get_all_labels())

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the KorNLI corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'kornli'
    return check_exists(corpus_name, KORNLI_FETCH_INFORMATION, root_dir=root_dir)


def fetch_kornli(root_dir, force_download):
for info in KORNLI_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_korsts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dataclasses import dataclass

from .korpora import Korpus, LabeledSentencePairKorpusData, LabeledSentencePair
from .utils import fetch, default_korpora_path, load_text
from .utils import fetch, default_korpora_path, load_text, check_exists


KORSTS_FETCH_INFORMATION = [
Expand Down Expand Up @@ -130,6 +130,10 @@ def get_all_filenames(self):
def get_all_years(self):
    """Return the years of the train, dev and test splits, concatenated in that order."""
    train_years = self.train.get_all_years()
    dev_years = self.dev.get_all_years()
    test_years = self.test.get_all_years()
    return train_years + dev_years + test_years

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the KorSTS corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'korsts'
    return check_exists(corpus_name, KORSTS_FETCH_INFORMATION, root_dir=root_dir)


def fetch_korsts(root_dir, force_download):
for info in KORSTS_FETCH_INFORMATION:
Expand Down
6 changes: 5 additions & 1 deletion Korpora/korpus_kowiki.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from .korpora import Korpus, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_wikitext
from .utils import fetch, default_korpora_path, load_wikitext, check_exists


KOWIKI_FETCH_INFORMATION = [
Expand Down Expand Up @@ -82,6 +82,10 @@ def split_title_text(wikitext):
# swap position
return texts, titles

@classmethod
def exists(cls, root_dir=None):
    """Return the ``check_exists`` result for the kowikitext corpus.

    ``root_dir`` is forwarded unchanged; how ``None`` is interpreted is
    decided by ``check_exists`` (presumably the default Korpora directory --
    confirm there).
    """
    corpus_name = 'kowikitext'
    return check_exists(corpus_name, KOWIKI_FETCH_INFORMATION, root_dir=root_dir)


def fetch_kowikitext(root_dir, force_download):
for information in KOWIKI_FETCH_INFORMATION:
Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_messenger.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@
class ModuMessengerKorpus(ModuKorpus):
    """Loader for the locally installed Modu messenger corpus (``NIKL_MESSENGER``).

    Args:
        root_dir: Location handed to ``ModuKorpus.get_corpus_path`` together
            with the ``'NIKL_MESSENGER'`` corpus name; ``None`` is accepted.
            NOTE(review): the resolution rules live in
            ``ModuKorpus.get_corpus_path``, which is defined outside this
            class -- confirm it covers the default-directory and
            ``root_dir/NIKL_MESSENGER`` cases.
        force_download: Accepted for interface compatibility; this class does
            not read it.

    Raises:
        ValueError: If no corpus file is found.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__()
        # Path resolution is delegated entirely to get_corpus_path so that
        # __init__ and exists() look in the same place; resolving root_dir
        # here as well would hand an already-rewritten path to the helper.
        paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths)
        if not paths:
            raise ValueError('Not found corpus files. Check `root_dir`')

        self.train = KorpusData('모두의_메신저_말뭉치(conversation).train', load_modu_messenger(paths))

    @classmethod
    def exists(cls, root_dir=None):
        """Return True if at least one messenger corpus file is found."""
        paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths)
        return bool(paths)


@dataclass
class Utterance:
Expand Down Expand Up @@ -59,8 +62,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_morpheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
class ModuMorphemeKorpus(ModuKorpus):
def __init__(self, root_dir=None, force_download=False):
super().__init__()
if root_dir is None:
root_dir = os.path.join(default_korpora_path, 'NIKL_MP')
alternative_root_dir = os.path.join(root_dir, 'NIKL_MP')
if os.path.exists(alternative_root_dir):
root_dir = alternative_root_dir
paths = find_corpus_paths(root_dir)
paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths)
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')

self.train = KorpusData('모두의_형태분석_말뭉치.train', load_modu_morpheme(paths))
self.tagmap = {
'JKS': '주격조사',
Expand Down Expand Up @@ -72,6 +70,11 @@ def __init__(self, root_dir=None, force_download=False):
'NAP': '이름과 같은 개인정보'
}

@classmethod
def exists(cls, root_dir=None):
    """Return True if at least one ``NIKL_MP`` corpus file is found.

    The lookup is delegated to ``ModuKorpus.get_corpus_path`` with this
    module's ``find_corpus_paths`` as the finder.
    """
    found = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths)
    return len(found) > 0


def find_corpus_paths(root_dir_or_paths):
prefix_pattern = re.compile('[NS]XMP')
Expand All @@ -86,8 +89,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
17 changes: 9 additions & 8 deletions Korpora/korpus_modu_ne.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
class ModuNEKorpus(ModuKorpus):
def __init__(self, root_dir=None, force_download=False):
super().__init__()
if root_dir is None:
root_dir = os.path.join(default_korpora_path, 'NIKL_NE')
alternative_root_dir = os.path.join(root_dir, 'NIKL_NE')
if os.path.exists(alternative_root_dir):
root_dir = alternative_root_dir
paths = find_corpus_paths(root_dir)
paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths)
if not paths:
raise ValueError('Not found corpus files. Check `root_dir`')

self.train = KorpusData('모두의_개체명_말뭉치.train', load_modu_ne(paths))
self.tagmap = {
'PS': 'PERSON',
Expand All @@ -39,6 +37,11 @@ def __init__(self, root_dir=None, force_download=False):
'TM': 'TERM'
}

@classmethod
def exists(cls, root_dir=None):
    """Return True if at least one ``NIKL_NE`` corpus file is found.

    The lookup is delegated to ``ModuKorpus.get_corpus_path`` with this
    module's ``find_corpus_paths`` as the finder.
    """
    found = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths)
    return len(found) > 0


def find_corpus_paths(root_dir_or_paths):
prefix_pattern = re.compile('[NS]XNE')
Expand All @@ -53,8 +56,6 @@ def match(path):
paths = root_dir_or_paths

paths = [path for path in paths if match(path)]
if not paths:
raise ValueError('Not found corpus files. Check `root_dir_or_paths`')
return paths


Expand Down
Loading

0 comments on commit 2a2be9d

Please sign in to comment.