diff --git a/Korpora/korpus_aihub_translation.py b/Korpora/korpus_aihub_translation.py index 394f40b..b187afb 100644 --- a/Korpora/korpus_aihub_translation.py +++ b/Korpora/korpus_aihub_translation.py @@ -57,46 +57,98 @@ class AIHubTranslationKorpus(Korpus): def __init__(self, root_dir=None, force_download=False, prefix='', name='AIHub_translation'): super().__init__(description, license) - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'AIHub_Translation', prefix) - elif isinstance(root_dir, str) and os.path.isdir(root_dir): - root_dir = os.path.join(root_dir, 'AIHub_Translation', prefix) - paths = find_corpus_paths(root_dir) + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, prefix, find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = SentencePairKorpusData( f'{name}.train', *load_aihub_translation(paths, name) ) + @classmethod + def get_corpus_path(cls, root_dir=None, prefix='', finder=None): + if root_dir is None: + root_dir = os.path.join(default_korpora_path, 'AIHub_Translation', prefix) + elif isinstance(root_dir, str) and os.path.isdir(root_dir): + root_dir = os.path.join(root_dir, 'AIHub_Translation', prefix) + paths = [] + if callable(finder): + paths = finder(root_dir) + + return paths + + @classmethod + def exists(cls, root_dir=None): + return all([ + AIHubSpokenTranslationKorpus.exists(root_dir=root_dir), + AIHubConversationTranslationKorpus.exists(root_dir=root_dir), + AIHubNewsTranslationKorpus.exists(root_dir=root_dir), + AIHubKoreanCultureTranslationKorpus.exists(root_dir=root_dir), + AIHubDecreeTranslationKorpus.exists(root_dir=root_dir), + AIHubGovernmentWebsiteTranslationKorpus.exists(root_dir=root_dir), + ]) + class AIHubSpokenTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '1_spoken*', 'AIHub_spoken_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '1_spoken*', find_corpus_paths) + return len(paths) > 0 + class AIHubConversationTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '2_conversation*', 'AIHub_conversation_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '2_conversation*', find_corpus_paths) + return len(paths) > 0 + class AIHubNewsTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '3_news*', 'AIHub_news_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '3_news*', find_corpus_paths) + return len(paths) > 0 + class AIHubKoreanCultureTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '4_korean_culture*', 'AIHub_korean_culture_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '4_korean_culture*', find_corpus_paths) + return len(paths) > 0 + class AIHubDecreeTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '5_decree*', 'AIHub_decree_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '5_decree*', find_corpus_paths) + return len(paths) > 0 + class AIHubGovernmentWebsiteTranslationKorpus(AIHubTranslationKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__(root_dir, force_download, '6_government_website*', 'AIHub_government_website_translation') + @classmethod + def exists(cls, root_dir=None): + paths = AIHubTranslationKorpus.get_corpus_path(root_dir, '6_government_website*', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir, suffix='200226.xlsx'): def match(path): @@ -109,8 +161,6 @@ def match(path): paths = root_dir paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir`') return paths diff --git a/Korpora/korpus_chatbot_data.py b/Korpora/korpus_chatbot_data.py index d43b51b..e385741 100644 --- a/Korpora/korpus_chatbot_data.py +++ b/Korpora/korpus_chatbot_data.py @@ -2,7 +2,7 @@ import csv from .korpora import Korpus, LabeledSentencePairKorpusData -from .utils import fetch, default_korpora_path +from .utils import fetch, default_korpora_path, check_exists KOREAN_CHATBOT_FETCH_INFORMATION = [ @@ -64,6 +64,10 @@ def get_all_pairs(self): def get_all_labels(self): return self.train.get_all_labels() + @classmethod + def exists(cls, root_dir=None): + return check_exists('korean_chatbot_data', KOREAN_CHATBOT_FETCH_INFORMATION, root_dir=root_dir) + def fetch_chatbot(root_dir, force_download): for information in KOREAN_CHATBOT_FETCH_INFORMATION: diff --git a/Korpora/korpus_kcbert.py b/Korpora/korpus_kcbert.py index 43cf1b3..401d768 100644 --- a/Korpora/korpus_kcbert.py +++ b/Korpora/korpus_kcbert.py @@ -2,7 +2,7 @@ import platform from .korpora import Korpus, KorpusData -from .utils import fetch, load_text, default_korpora_path +from .utils import fetch, load_text, default_korpora_path, check_exists KCBERT_FETCH_INFORMATION = [ @@ -67,6 +67,10 @@ def __init__(self, root_dir=None, force_download=False): dirname = os.path.abspath(f'{root_dir}/kcbert') self.train = f'KcBERT corpus is downloaded. Open local directory {dirname}' + @classmethod + def exists(cls, root_dir=None): + return check_exists('kcbert', KCBERT_FETCH_INFORMATION, root_dir=root_dir) + def fetch_kcbert(root_dir, force_download): for info in KCBERT_FETCH_INFORMATION: diff --git a/Korpora/korpus_korean_hate_speech.py b/Korpora/korpus_korean_hate_speech.py index 5b9181c..8e0ec8b 100644 --- a/Korpora/korpus_korean_hate_speech.py +++ b/Korpora/korpus_korean_hate_speech.py @@ -3,7 +3,7 @@ from typing import List from .korpora import Korpus, KorpusData, SentencePairKorpusData -from .utils import fetch, default_korpora_path, load_text +from .utils import fetch, default_korpora_path, load_text, check_exists KOREAN_HATE_SPEECH_FETCH_INFORMATION = [ @@ -153,6 +153,10 @@ def __init__(self, root_dir=None, force_download=False): self.unlabeled = load_unlabeled(root_dir) self.test = load_test(root_dir) + @classmethod + def exists(cls, root_dir=None): + return check_exists('korean_hate_speech', KOREAN_HATE_SPEECH_FETCH_INFORMATION, root_dir=root_dir) + def load_train(root_dir): # head : comments, contain_gender_bias, bias, hate diff --git a/Korpora/korpus_korean_parallel.py b/Korpora/korpus_korean_parallel.py index cfa6a7e..db8209e 100644 --- a/Korpora/korpus_korean_parallel.py +++ b/Korpora/korpus_korean_parallel.py @@ -2,7 +2,7 @@ from typing import List from .korpora import Korpus, SentencePairKorpusData -from .utils import fetch, default_korpora_path, load_parallel_text +from .utils import fetch, default_korpora_path, load_parallel_text, check_exists KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION = [ @@ -58,6 +58,10 @@ def fetch_and_load(self, mode, root_dir, fetch_info, force_download): sources, targets = load_parallel_text(source_path, target_path) return SentencePairKorpusData(dataname, sources, targets) + @classmethod + def exists(cls, root_dir=None): + return check_exists('korean_parallel_koen_news', KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION, root_dir=root_dir) + def fetch_korean_parallel_koen_news(root_dir, force_download): for info in KOREAN_PARALLEL_KOEN_NEWS_FETCH_INFORMATION: diff --git a/Korpora/korpus_korean_petitions.py b/Korpora/korpus_korean_petitions.py index 0b65ed7..017e85e 100644 --- a/Korpora/korpus_korean_petitions.py +++ b/Korpora/korpus_korean_petitions.py @@ -4,7 +4,7 @@ from typing import List from .korpora import Korpus, KorpusData -from .utils import fetch, default_korpora_path, load_text +from .utils import fetch, default_korpora_path, load_text, check_exists KOREAN_PETITIONS_FETCH_INFORMATION = [ @@ -230,6 +230,10 @@ def get_all_num_agrees(self): def get_all_titles(self): return self.train.titles + @classmethod + def exists(cls, root_dir=None): + return check_exists('korean_petitions', KOREAN_PETITIONS_FETCH_INFORMATION, root_dir=root_dir) + def fetch_korean_petitions(root_dir, force_download): for info in KOREAN_PETITIONS_FETCH_INFORMATION: diff --git a/Korpora/korpus_kornli.py b/Korpora/korpus_kornli.py index 73dfd78..5288d49 100644 --- a/Korpora/korpus_kornli.py +++ b/Korpora/korpus_kornli.py @@ -2,7 +2,7 @@ from typing import List from .korpora import Korpus, LabeledSentencePairKorpusData -from .utils import fetch, default_korpora_path, load_text +from .utils import fetch, default_korpora_path, load_text, check_exists KORNLI_FETCH_INFORMATION = [ @@ -102,6 +102,10 @@ def get_all_labels(self): self.xnli_dev.get_all_labels() + self.xnli_test.get_all_labels()) + @classmethod + def exists(cls, root_dir=None): + return check_exists('kornli', KORNLI_FETCH_INFORMATION, root_dir=root_dir) + def fetch_kornli(root_dir, force_download): for info in KORNLI_FETCH_INFORMATION: diff --git a/Korpora/korpus_korsts.py b/Korpora/korpus_korsts.py index a02c587..d077083 100644 --- a/Korpora/korpus_korsts.py +++ b/Korpora/korpus_korsts.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from .korpora import Korpus, LabeledSentencePairKorpusData, LabeledSentencePair -from .utils import fetch, default_korpora_path, load_text +from .utils import fetch, default_korpora_path, load_text, check_exists KORSTS_FETCH_INFORMATION = [ @@ -130,6 +130,10 @@ def get_all_filenames(self): def get_all_years(self): return self.train.get_all_years() + self.dev.get_all_years() + self.test.get_all_years() + @classmethod + def exists(cls, root_dir=None): + return check_exists('korsts', KORSTS_FETCH_INFORMATION, root_dir=root_dir) + def fetch_korsts(root_dir, force_download): for info in KORSTS_FETCH_INFORMATION: diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py index 78bb31a..d93bbd2 100644 --- a/Korpora/korpus_kowiki.py +++ b/Korpora/korpus_kowiki.py @@ -1,6 +1,6 @@ import os from .korpora import Korpus, SentencePairKorpusData -from .utils import fetch, default_korpora_path, load_wikitext +from .utils import fetch, default_korpora_path, load_wikitext, check_exists KOWIKI_FETCH_INFORMATION = [ @@ -82,6 +82,10 @@ def split_title_text(wikitext): # swap position return texts, titles + @classmethod + def exists(cls, root_dir=None): + return check_exists('kowikitext', KOWIKI_FETCH_INFORMATION, root_dir=root_dir) + def fetch_kowikitext(root_dir, force_download): for information in KOWIKI_FETCH_INFORMATION: diff --git a/Korpora/korpus_modu_messenger.py b/Korpora/korpus_modu_messenger.py index 69cee45..688b6dc 100644 --- a/Korpora/korpus_modu_messenger.py +++ b/Korpora/korpus_modu_messenger.py @@ -14,14 +14,17 @@ class ModuMessengerKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_MESSENGER') - alternative_root_dir = os.path.join(root_dir, 'NIKL_MESSENGER') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_메신저_말뭉치(conversation).train', load_modu_messenger(paths)) + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MESSENGER', find_corpus_paths) + return len(paths) > 0 + @dataclass class Utterance: @@ -59,8 +62,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_morpheme.py b/Korpora/korpus_modu_morpheme.py index a7be8d6..830ab1b 100644 --- a/Korpora/korpus_modu_morpheme.py +++ b/Korpora/korpus_modu_morpheme.py @@ -14,12 +14,10 @@ class ModuMorphemeKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_MP') - alternative_root_dir = os.path.join(root_dir, 'NIKL_MP') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_형태분석_말뭉치.train', load_modu_morpheme(paths)) self.tagmap = { 'JKS': '주격조사', @@ -72,6 +70,11 @@ def __init__(self, root_dir=None, force_download=False): 'NAP': '이름과 같은 개인정보' } + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_MP', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir_or_paths): prefix_pattern = re.compile('[NS]XMP') @@ -86,8 +89,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_ne.py b/Korpora/korpus_modu_ne.py index b2203f8..9332697 100644 --- a/Korpora/korpus_modu_ne.py +++ b/Korpora/korpus_modu_ne.py @@ -14,12 +14,10 @@ class ModuNEKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_NE') - alternative_root_dir = os.path.join(root_dir, 'NIKL_NE') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_개체명_말뭉치.train', load_modu_ne(paths)) self.tagmap = { 'PS': 'PERSON', @@ -39,6 +37,11 @@ def __init__(self, root_dir=None, force_download=False): 'TM': 'TERM' } + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NE', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir_or_paths): prefix_pattern = re.compile('[NS]XNE') @@ -53,8 +56,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index 4489f90..92a5796 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -48,16 +48,27 @@ def __init__(self, force_download=False): fetch_modu() super().__init__(description, license) + @classmethod + def get_corpus_path(cls, root_dir=None, prefix='', finder=None): + if root_dir is None: + root_dir = os.path.join(default_korpora_path, prefix) + alternative_root_dir = os.path.join(root_dir, prefix) + if os.path.exists(alternative_root_dir): + root_dir = alternative_root_dir + paths = [] + if callable(finder): + paths = finder(root_dir) + + return paths + class ModuNewsKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False, load_light=True): super().__init__(force_download) - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_NEWSPAPER') - alternative_root_dir = os.path.join(root_dir, 'NIKL_NEWSPAPER') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NEWSPAPER', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + if load_light: self.train = ModuNewsDataLight('모두의_뉴스_말뭉치(light).train', load_modu_news(paths, load_light)) else: @@ -65,6 +76,11 @@ def __init__(self, root_dir=None, force_download=False, load_light=True): self.row_to_documentid = [news.document_id for news in self.train] self.documentid_to_row = {document_id: idx for idx, document_id in enumerate(self.row_to_documentid)} + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_NEWSPAPER', find_corpus_paths) + return len(paths) > 0 + class ModuNewsData(KorpusData): def __init__(self, name, news): @@ -159,8 +175,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_spoken.py b/Korpora/korpus_modu_spoken.py index 0919357..14ac978 100644 --- a/Korpora/korpus_modu_spoken.py +++ b/Korpora/korpus_modu_spoken.py @@ -14,14 +14,17 @@ class ModuSpokenKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_SPOKEN') - alternative_root_dir = os.path.join(root_dir, 'NIKL_SPOKEN') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_SPOKEN', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_구어_말뭉치.train', load_modu_spoken(paths)) + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_SPOKEN', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir_or_paths): prefix_pattern = re.compile('S[ABDE]RW') @@ -36,8 +39,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_web.py b/Korpora/korpus_modu_web.py index 64dabf1..5f03103 100644 --- a/Korpora/korpus_modu_web.py +++ b/Korpora/korpus_modu_web.py @@ -14,14 +14,17 @@ class ModuWebKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_WEB') - alternative_root_dir = os.path.join(root_dir, 'NIKL_WEB') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_WEB', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_웹_말뭉치.train', load_modu_web(paths)) + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_WEB', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir_or_paths): prefix_pattern = re.compile('E[BPSR]RW') @@ -36,8 +39,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_modu_written.py b/Korpora/korpus_modu_written.py index 409f98e..350eedd 100644 --- a/Korpora/korpus_modu_written.py +++ b/Korpora/korpus_modu_written.py @@ -14,14 +14,17 @@ class ModuWrittenKorpus(ModuKorpus): def __init__(self, root_dir=None, force_download=False): super().__init__() - if root_dir is None: - root_dir = os.path.join(default_korpora_path, 'NIKL_WRITTEN') - alternative_root_dir = os.path.join(root_dir, 'NIKL_WRITTEN') - if os.path.exists(alternative_root_dir): - root_dir = alternative_root_dir - paths = find_corpus_paths(root_dir) + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_WRITTEN', find_corpus_paths) + if not paths: + raise ValueError('Not found corpus files. Check `root_dir`') + self.train = KorpusData('모두의_문어_말뭉치.train', load_modu_written(paths)) + @classmethod + def exists(cls, root_dir=None): + paths = ModuKorpus.get_corpus_path(root_dir, 'NIKL_WRITTEN', find_corpus_paths) + return len(paths) > 0 + def find_corpus_paths(root_dir_or_paths): prefix_pattern = re.compile('W[ABCZ]RW') @@ -36,8 +39,6 @@ def match(path): paths = root_dir_or_paths paths = [path for path in paths if match(path)] - if not paths: - raise ValueError('Not found corpus files. Check `root_dir_or_paths`') return paths diff --git a/Korpora/korpus_namuwiki.py b/Korpora/korpus_namuwiki.py index 31d4868..d43efc9 100644 --- a/Korpora/korpus_namuwiki.py +++ b/Korpora/korpus_namuwiki.py @@ -1,6 +1,6 @@ import os from .korpora import Korpus, SentencePairKorpusData -from .utils import fetch, default_korpora_path, load_wikitext +from .utils import fetch, default_korpora_path, load_wikitext, check_exists NAMUWIKI_FETCH_INFORMATION = [ @@ -82,6 +82,10 @@ def split_title_text(wikitext): # swap position return texts, titles + @classmethod + def exists(cls, root_dir=None): + return check_exists('namuwikitext', NAMUWIKI_FETCH_INFORMATION, root_dir=root_dir) + def fetch_namuwikitext(root_dir, force_download): for information in NAMUWIKI_FETCH_INFORMATION: diff --git a/Korpora/korpus_naverchangwon_ner.py b/Korpora/korpus_naverchangwon_ner.py index 1ab0a7e..3bbe952 100644 --- a/Korpora/korpus_naverchangwon_ner.py +++ b/Korpora/korpus_naverchangwon_ner.py @@ -2,7 +2,7 @@ from typing import List from .korpora import Korpus, WordTagKorpusData -from .utils import fetch, default_korpora_path, load_text +from .utils import fetch, default_korpora_path, load_text, check_exists NAVER_CHANGWON_NER_FETCH_INFORMATION = [ @@ -65,6 +65,10 @@ def get_all_tags(self): def get_all_words_and_tags(self): return [item for item in self.train] + @classmethod + def exists(cls, root_dir=None): + return check_exists('naver_changwon_ner', NAVER_CHANGWON_NER_FETCH_INFORMATION, root_dir=root_dir) + def fetch_naverchangwon_ner(root_dir, force_download): for info in NAVER_CHANGWON_NER_FETCH_INFORMATION: diff --git a/Korpora/korpus_nsmc.py b/Korpora/korpus_nsmc.py index dbf5c6e..4387663 100644 --- a/Korpora/korpus_nsmc.py +++ b/Korpora/korpus_nsmc.py @@ -2,7 +2,7 @@ from typing import List from .korpora import Korpus, LabeledSentenceKorpusData -from .utils import fetch, load_text, default_korpora_path +from .utils import fetch, load_text, default_korpora_path, check_exists NSMC_FETCH_INFORMATION = [ @@ -66,6 +66,10 @@ def get_all_texts(self): def get_all_labels(self): return self.train.labels + self.test.labels + @classmethod + def exists(cls, root_dir=None): + return check_exists('nsmc', NSMC_FETCH_INFORMATION, root_dir=root_dir) + def fetch_nsmc(root_dir, force_download): for info in NSMC_FETCH_INFORMATION: diff --git a/Korpora/korpus_open_subtitles.py b/Korpora/korpus_open_subtitles.py index b27618c..7b6d4b9 100644 --- a/Korpora/korpus_open_subtitles.py +++ b/Korpora/korpus_open_subtitles.py @@ -3,7 +3,7 @@ import urllib from .korpora import Korpus, SentencePairKorpusData -from .utils import fetch, default_korpora_path +from .utils import fetch, default_korpora_path, check_exists OPEN_SUBTITLES_FETCH_INFORMATION = [ @@ -63,6 +63,10 @@ def __init__(self, root_dir=None, force_download=False): def get_all_pairs(self): return self.train.get_all_pairs() + @classmethod + def exists(cls, root_dir=None): + return check_exists('open_subtitles', OPEN_SUBTITLES_FETCH_INFORMATION, root_dir=root_dir) + def parse_xtm(path): pattern = re.compile('[\S ]+') diff --git a/Korpora/korpus_question_pair.py b/Korpora/korpus_question_pair.py index f06262f..0adab27 100644 --- a/Korpora/korpus_question_pair.py +++ b/Korpora/korpus_question_pair.py @@ -2,7 +2,7 @@ import csv from .korpora import Korpus, LabeledSentencePairKorpusData -from .utils import fetch, default_korpora_path +from .utils import fetch, default_korpora_path, check_exists QUESTION_PAIR_FETCH_INFORMATION = [ @@ -73,6 +73,10 @@ def get_all_pairs(self): def get_all_labels(self): return self.train.get_all_labels() + self.test.get_all_labels() + @classmethod + def exists(cls, root_dir=None): + return check_exists('question_pair', QUESTION_PAIR_FETCH_INFORMATION, root_dir=root_dir) + def fetch_questionpair(root_dir, force_download): for info in QUESTION_PAIR_FETCH_INFORMATION: diff --git a/Korpora/loader.py b/Korpora/loader.py index a8bd019..b79b3e7 100644 --- a/Korpora/loader.py +++ b/Korpora/loader.py @@ -73,6 +73,22 @@ def fetch(cls, corpus_name, root_dir=None, force_download=False): def corpus_list(cls): return KORPUS_DESCRIPTION + @classmethod + def exists(cls, corpus_name, root_dir=None, return_by_each_corpus=False): + if (corpus_name == 'all') or (corpus_name[0] == 'all'): + corpus_name = sorted(KORPUS.keys()) + elif isinstance(corpus_name, str): + corpus_name = [corpus_name] + + if root_dir is None: + root_dir = default_korpora_path + + corpora = [KORPUS[name].exists(root_dir=root_dir) for name in corpus_name] + if return_by_each_corpus: + return corpora + + return all(corpora) + KORPUS = { 'kcbert': KcBERTKorpus, diff --git a/Korpora/task_lmdata.py b/Korpora/task_lmdata.py index 3b96c0c..e40f2f2 100644 --- a/Korpora/task_lmdata.py +++ b/Korpora/task_lmdata.py @@ -7,7 +7,6 @@ def create_lmdata(args): - corpus_names = check_corpus(args.corpus) os.makedirs(os.path.abspath(args.output_dir), exist_ok=True) sampling_ratio = args.sampling_ratio @@ -25,6 +24,7 @@ def create_lmdata(args): force_download = args.force_download multilingual = args.multilingual + corpus_names = check_corpus(root_dir, args.corpus) status = [['', name, ' - ', ''] for name in corpus_names] for i_corpus, name in enumerate(corpus_names): @@ -79,7 +79,7 @@ def use(self, text): return np.random.rand() < self.sampling_ratio -def check_corpus(corpus_names): +def check_corpus(root_dir, corpus_names): if (corpus_names == 'all') or (corpus_names[0] == 'all'): corpus_names = list(ITERATE_TEXTS) if isinstance(corpus_names, str): @@ -89,22 +89,25 @@ def check_corpus(corpus_names): if name not in ITERATE_TEXTS: print(f'Not provide {name} corpus. Check the `corpus` argument') continue - available.append(name) + if Korpora.exists(name, root_dir=root_dir): + available.append(name) if not available: - raise ValueError('Not found any proper corpus name. Check the `corpus` argument') + raise ValueError( + 'Not found any proper corpus name. Check the `corpus` argument') return available def print_status(status): max_len = max(max(len(row[3]) for row in status), 9) form = '| {:4} | {:25} | {:10} | {} |' - print('\n\n' + form.format('Done', 'Corpus name', 'Num sents', 'File name' + ' ' * (max_len - 9))) + print('\n\n' + form.format('Done', 'Corpus name', + 'Num sents', 'File name' + ' ' * (max_len - 9))) print(form.format('-' * 4, '-' * 25, '-' * 10, '-' * max_len)) for finish, name, num_sent, filename in status: if not filename: filename = ' ' * max_len else: - filename += ' ' * (max_len -len(filename)) + filename += ' ' * (max_len - len(filename)) print(form.format(finish, name, num_sent, filename)) @@ -135,7 +138,8 @@ def iterate_korean_hate_speech(root_dir, force_download, multilingual=False): def iterate_korean_parallel_koen_news(root_dir, force_download, multilingual): - corpus = Korpora.load('korean_parallel_koen_news', root_dir, force_download) + corpus = Korpora.load('korean_parallel_koen_news', + root_dir, force_download) data = [corpus.train.texts, corpus.dev.texts, corpus.test.texts] if multilingual: data += [corpus.train.pairs, corpus.dev.pairs, corpus.test.pairs] @@ -220,11 +224,64 @@ def iterate_question_pair(root_dir, force_download, multilingual=False): for sent in sents: yield sent + def iterate_open_subtitles(root_dir, force_download, multilingual=False): corpus = Korpora.load('open_subtitles', root_dir, force_download) for sent in corpus.train.texts: yield sent + +def iterate_modu_news(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_news', root_dir, force_download) + for sent in corpus.train.texts: + yield sent + + +def iterate_modu_messenger(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_messenger', root_dir, force_download) + for utt in corpus.train.texts: + for sent in utt.form: + yield sent + + +def iterate_modu_mp(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_mp', root_dir, force_download) + for mp in corpus.train.texts: + yield mp.sentence + + +def iterate_modu_ne(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_ne', root_dir, force_download) + for sent in corpus.train.texts: + yield sent.sentence + + +def iterate_modu_spoken(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_spoken', root_dir, force_download) + for sent in corpus.train.texts: + yield sent + + +def iterate_modu_web(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_web', root_dir, force_download) + for sent in corpus.train.texts: + yield sent + + +def iterate_modu_written(root_dir, force_download, multilingual=False): + corpus = Korpora.load('modu_written', root_dir, force_download) + for sent in corpus.train.texts: + yield sent + + +def iterate_aihub_translation(corpus_name=''): + def fn_iterate_aihub_translation(root_dir, force_download, multilingual=False): + corpus = Korpora.load(corpus_name, root_dir, force_download) + for sent in corpus.train.texts: + yield sent + return fn_iterate_aihub_translation + + ITERATE_TEXTS = { 'kcbert': iterate_kcbert, 'korean_chatbot_data': iterate_korean_chatbot_data, @@ -239,4 +296,17 @@ def iterate_open_subtitles(root_dir, force_download, multilingual=False): 'nsmc': iterate_nsmc, 'question_pair': iterate_question_pair, 'open_subtitles': iterate_open_subtitles, + 'modu_news': iterate_modu_news, + 'modu_messenger': iterate_modu_messenger, + 'modu_mp': iterate_modu_mp, + 'modu_ne': iterate_modu_ne, + 'modu_spoken': iterate_modu_spoken, + 'modu_web': iterate_modu_web, + 'modu_written': iterate_modu_written, + 'aihub_spoken_translation': iterate_aihub_translation('aihub_spoken_translation'), + 'aihub_conversation_translation': iterate_aihub_translation('aihub_conversation_translation'), + 'aihub_news_translation': iterate_aihub_translation('aihub_news_translation'), + 'aihub_korean_culture_translation': iterate_aihub_translation('aihub_korean_culture_translation'), + 'aihub_decree_translation': iterate_aihub_translation('aihub_decree_translation'), + 'aihub_government_website_translation': iterate_aihub_translation('aihub_government_website_translation'), } diff --git a/Korpora/utils.py b/Korpora/utils.py index bce402c..933e4aa 100644 --- a/Korpora/utils.py +++ b/Korpora/utils.py @@ -23,6 +23,23 @@ def check_dir(filepath): os.makedirs(dirname) +def check_exists(corpus_name, informations, root_dir=None): + if root_dir is None: + root_dir = default_korpora_path + + all_is_ok = True + for information in informations: + local_installed_path = os.path.join(root_dir, information['destination']) + if not os.path.exists(local_installed_path): + print(f'Not found {local_installed_path}') + all_is_ok = False + + if not all_is_ok: + print(f'Install corpus using `Korpora.fetch("{corpus_name}")`') + + return all_is_ok + + def load_text(path, num_heads=0, num_samples=-1): lines = [] with open(path, encoding='utf-8') as f: diff --git a/tests/manual.py b/tests/manual.py index e5188bc..eaf0cf5 100644 --- a/tests/manual.py +++ b/tests/manual.py @@ -40,6 +40,13 @@ def fetch_test(args): time.sleep(0.5) +def exists_test(args): + for corpus_name in corpus_list: + result = Korpora.exists(corpus_name, root_dir=args.root_dir) + print(corpus_name, result) + time.sleep(0.1) + + def load_small_test(args): exclusive_load_test = { 'kcbert', 'kowikitext', 'namuwikitext', @@ -81,6 +88,11 @@ def main(): parser_fetch.add_argument('--root_dir', type=str, default=None, help='default is `~/Korpora/`') parser_fetch.set_defaults(func=fetch_test) + # exists + parser_fetch = subparsers.add_parser('exists', help='Check if a `corpus` exists') + parser_fetch.add_argument('--root_dir', type=str, default=None, help='default is `~/Korpora/`') + parser_fetch.set_defaults(func=exists_test) + # load small corpus parser_load_small = subparsers.add_parser('load_small', help='Fetch `corpus` to `root`') parser_load_small.add_argument('--root_dir', type=str, default=None, help='default is `~/Korpora/`') diff --git a/tests/test_korean_chatbot_data.py b/tests/test_korean_chatbot_data.py index 7307b71..3477418 100644 --- a/tests/test_korean_chatbot_data.py +++ b/tests/test_korean_chatbot_data.py @@ -4,6 +4,7 @@ def test_usage(): chatbot_data = Korpora.load('korean_chatbot_data') + assert chatbot_data.exists() assert len(chatbot_data.train) == len(KoreanChatbotKorpus().train) assert len(chatbot_data.train.texts) == 11823 assert len(chatbot_data.train.pairs) == 11823 diff --git a/tests/test_korean_hate_speech.py b/tests/test_korean_hate_speech.py index e330b0d..e414a7e 100644 --- a/tests/test_korean_hate_speech.py +++ b/tests/test_korean_hate_speech.py @@ -4,6 +4,8 @@ def test_usage(): korean_hate_speech = Korpora.load('korean_hate_speech') korean_hate_speech_ = KoreanHateSpeechKorpus() + assert korean_hate_speech.exists() + assert korean_hate_speech_.exists() assert len(korean_hate_speech.unlabeled) == len(korean_hate_speech_.unlabeled) == 2033893 assert len(korean_hate_speech.test) == 974 assert len(korean_hate_speech.dev) == 471 diff --git a/tests/test_korean_parallel.py b/tests/test_korean_parallel.py index aaba2cf..a2bfdf5 100644 --- a/tests/test_korean_parallel.py +++ b/tests/test_korean_parallel.py @@ -4,6 +4,8 @@ def test_usage(): koen_news = Korpora.load('korean_parallel_koen_news') koen_news_ = KoreanParallelKOENNewsKorpus() + assert koen_news.exists() + assert koen_news_.exists() assert len(koen_news.train) == len(koen_news_.train) == 94123 assert len(koen_news.dev) == 1000 assert len(koen_news.test) == 2000 diff --git a/tests/test_korean_petitions.py b/tests/test_korean_petitions.py index b3649eb..ccc243c 100644 --- a/tests/test_korean_petitions.py +++ b/tests/test_korean_petitions.py @@ -4,6 +4,7 @@ def test_usage(): petitions = Korpora.load('korean_petitions') + assert petitions.exists() assert len(petitions.train) == len(KoreanPetitionsKorpus().train) assert len(petitions.train) == 433631 assert len(petitions.train[0].text) == 1491 diff --git a/tests/test_kornli.py b/tests/test_kornli.py index 842f86a..477e99d 100644 --- a/tests/test_kornli.py +++ b/tests/test_kornli.py @@ -3,6 +3,7 @@ def test_usage(): kornli = Korpora.load('kornli') + assert kornli.exists() assert len(kornli.snli_train) == len(KorNLIKorpus().snli_train) kornli.snli_train[0] assert len(kornli.snli_train) == 550152 diff --git a/tests/test_korsts.py b/tests/test_korsts.py index e8ddcdf..1aef53a 100644 --- a/tests/test_korsts.py +++ b/tests/test_korsts.py @@ -3,6 +3,7 @@ def test_usage(): korsts = Korpora.load('korsts') + assert korsts.exists() assert len(korsts.train) == len(KorSTSKorpus().train) korsts.train[0] korsts.dev[0] diff --git a/tests/test_naver_changwon_ner.py b/tests/test_naver_changwon_ner.py index f09e392..f626a9f 100644 --- a/tests/test_naver_changwon_ner.py +++ b/tests/test_naver_changwon_ner.py @@ -3,6 +3,7 @@ def test_usage(): ner = Korpora.load('naver_changwon_ner') + assert ner.exists() assert len(ner.train) == len(NaverChangwonNERKorpus().train) ner.train[0] assert len(ner.train) == 90000 diff --git a/tests/test_nsmc.py b/tests/test_nsmc.py index a1106cf..a800a41 100644 --- a/tests/test_nsmc.py +++ b/tests/test_nsmc.py @@ -3,6 +3,7 @@ def test_usage(): nsmc = Korpora.load('nsmc') + assert nsmc.exists() assert len(nsmc.train.texts) == 150000 assert len(nsmc.train.labels) == 150000 assert len(nsmc.test.texts) == 50000 diff --git a/tests/test_question_pair.py b/tests/test_question_pair.py index 9ab1854..9ce470b 100644 --- a/tests/test_question_pair.py +++ b/tests/test_question_pair.py @@ -3,6 +3,7 @@ def test_usage(): pair = Korpora.load('question_pair') + assert pair.exists() assert len(pair.train) == len(QuestionPairKorpus().train) pair.train[0] pair.test[0]