-
-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3b880d0
commit 9db9be4
Showing
9 changed files
with
436 additions
and
270 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from wiktionary_de_parser.dump_processor import WiktionaryDump\n", | ||
"\n", | ||
"\n", | ||
"dump = WiktionaryDump(dump_dir_path=\"tmp\")\n", | ||
"dump.download_dump()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from wiktionary_de_parser import WiktionaryParser\n", | ||
"\n", | ||
"parser = WiktionaryParser()\n", | ||
"pages = []\n", | ||
"\n", | ||
"for page in dump.pages():\n", | ||
" if page.redirect_to:\n", | ||
" continue\n", | ||
"\n", | ||
" # if page.name != \"ordo\":\n", | ||
" # continue\n", | ||
"\n", | ||
" pages.append(page)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "dfeb5760f9204baf92c8f07b37cda41d", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Processing chunks: 0%| | 0/27 [00:00<?, ?it/s]" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"from tqdm.notebook import tqdm\n", | ||
"from wiktionary_de_parser import WiktionaryParser\n", | ||
"from concurrent.futures import ProcessPoolExecutor\n", | ||
"import multiprocessing as mp\n", | ||
"\n", | ||
"from wiktionary_de_parser.models import MeaningDict\n", | ||
"\n", | ||
"\n", | ||
"def chunks(lst, n):\n", | ||
" \"\"\"Split list into n chunks\"\"\"\n", | ||
" chunk_size = len(lst) // n + (1 if len(lst) % n else 0)\n", | ||
" for i in range(0, len(lst), chunk_size):\n", | ||
" yield lst[i : i + chunk_size]\n", | ||
"\n", | ||
"\n", | ||
"def process_chunk(pages_chunk):\n", | ||
" local_dict = {}\n", | ||
" parser = WiktionaryParser()\n", | ||
"\n", | ||
" for page in pages_chunk:\n", | ||
" for entry in parser.entries_from_page(page):\n", | ||
" entry_parsed = parser.parse_entry(entry, include_meanings=True)\n", | ||
"\n", | ||
" if entry_parsed.meanings is None:\n", | ||
" continue\n", | ||
"\n", | ||
" local_dict[page.name] = entry_parsed.meanings\n", | ||
"\n", | ||
" return local_dict\n", | ||
"\n", | ||
"\n", | ||
"# Get number of CPU cores (leave one free for system)\n", | ||
"n_cores = max(1, mp.cpu_count() - 1)\n", | ||
"\n", | ||
"# Split pages into chunks\n", | ||
"page_chunks = list(chunks(pages, n_cores))\n", | ||
"\n", | ||
"all_lists: dict[str, list[MeaningDict]] = {}\n", | ||
"with ProcessPoolExecutor(max_workers=n_cores) as executor:\n", | ||
" # Process chunks in parallel with progress bar\n", | ||
" futures = list(\n", | ||
" tqdm(\n", | ||
" executor.map(process_chunk, page_chunks),\n", | ||
" total=len(page_chunks),\n", | ||
" desc=\"Processing chunks\",\n", | ||
" )\n", | ||
" )\n", | ||
"\n", | ||
" # Merge results\n", | ||
" for result_dict in futures:\n", | ||
" all_lists.update(result_dict)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"265321\n", | ||
"• gegenständlich: Reihe, Sitzreihe (Reihe der Sitze oder Bänke im Theater), Lage, Schicht\n", | ||
"• militärisch:\n", | ||
" • (abstrakt: Militär-Gruppierung, Legion) Glied, Zenturie, Abteilung, Abteilungsglied, Kompanie\n", | ||
" • (personifiziert) der Hauptmann selbst\n", | ||
"• politisch-gesellschaftlich: Stand, Klasse, Rang, Stellung\n", | ||
"• abstrakt: Ordnung, gehörige Reihenfolge, Anordnung, Regel\n", | ||
" • übertragen: Verfassung, Zustand\n", | ||
"• kirchlich (mittellateinisch):\n", | ||
" • Ordensgemeinschaft von Mönchen\n", | ||
" • kirchlicher Stand: Weihestufe\n", | ||
" • die (göttliche) Weltordnung\n", | ||
"• Taxonomie: Biologische Systematik (neulateinisch): Ordnung oder fachwissenschaftlich Ordo (Zusammenfassung mehrerer eng verwandter Familien, Teil eine Klasse)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from wiktionary_de_parser.parser.parse_meanings import (\n", | ||
" format_meanings,\n", | ||
")\n", | ||
"\n", | ||
"\n", | ||
"print(len(all_lists))\n", | ||
"\n", | ||
"print(format_meanings(all_lists[\"ordo\"]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"word: condicional\n", | ||
"• Der Kondicional der katalanischen Sprache drückt die Möglichkeit von Tatsachen oder Aktionen aus, die implizit oder explizit von Umständen in der Vergangenheit abhängig sind. Er wird in folgenden Situationen verwendet:\n", | ||
" • Zum Ausdruck der Wahrscheinlichkeit des Eintretens eines Ereignisses in der Zukunft:\n", | ||
" • Si tinguessim més temps, acabaríem el projecte.\n", | ||
" • Mit mehr Zeit würden wir das Projekt erfolgreich beenden.\n", | ||
" • Als Ausdruck einer höflichen Bitte (condicional de cortesia):\n", | ||
" • Tindries la bondat d’ / Podries ajudar-me en aquestes negociacions ?\n", | ||
" • Wärest du so freundlich, mich bei diesen Verhandlungen zu unterstützen?\n", | ||
" • Bei einer kaschierten oder vorsichtig formulierten Aufforderung:\n", | ||
" • Podríeu treballar més acuradament!\n", | ||
" • Ihr könntet sorgfältiger arbeiten!\n", | ||
" • Bei einem höflich, zurückhaltend formulierten Wunsch:\n", | ||
" • M’agradaria fer un creuer.\n", | ||
" • Ich möchte gern einmal eine Kreuzfahrt machen.\n", | ||
" • In höflich, zurückhaltend formulierten Äußerungen (bei geringer Wahrscheinlichkeit des Eintretens):\n", | ||
" • Podria ser que tingués raó.\n", | ||
" • Es könnte sein, dass er recht hätte.\n", | ||
" • Bei großer Wahrscheinlichkeit: Keine Verwendung des Konditionals, sondern des Präsens Indikativ (Pot ser):\n", | ||
" • Pot ser que tingui raó.\n", | ||
" • Kann sein, dass er recht hat.” Oder: „Wahrscheinlich hat er recht.\n", | ||
"\n", | ||
"\n", | ||
"word: nad\n", | ||
"• Präposition mit dem Instrumental:\n", | ||
" • örtlich, auf die Frage wo: über\n", | ||
" • bei Flüssen und Seen: an\n", | ||
" • in Verbindung mit Verben und Substantiven: über\n", | ||
"\n", | ||
"\n", | ||
"word: Irrigation\n", | ||
"• Spülung eines Hohlraums im Körper (z.B. Mund, Darm, Blase) mit einer Flüssigkeit\n", | ||
" • Medizin, Darmspülung: das Einleiten einer Flüssigkeit über den After bzw. das Stoma in den Dickdarm, z. B.\n", | ||
" • zur Reinigung des Dickdarms vor einer Darmspiegelung oder Operation\n", | ||
" • zur Entleerung des Darms bei Verstopfung (Obstipation)\n", | ||
" • zur Behandlung bestimmter Krankheiten, z. B. bei entzündlicher Colitis ulcerosa\n", | ||
" • zur Regulierung der Darmtätigkeit (Peristaltik) als spezielle Spülbehandlung bei Kolostomie-Patienten\n", | ||
" • zur Entschlackung des Körpers in der Alternativmedizin\n", | ||
" • Medizin: Spülung anderer Hohlorgane und Hohlräume, wie Nase, Ohr, Mund, Blase, Scheide, Eiterherde, z. B. antiseptische Irrigation oder als supragingivale Irrigation in der Zahnmedizin\n", | ||
" • Einlauf in den Dickdarm im Zusammenhang mit bestimmten Sexualpraktiken\n", | ||
"\n", | ||
"\n", | ||
"3\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# recursively calculate the depth of the wiki_list.sublist\n", | ||
"from wiktionary_de_parser.models import MeaningDict\n", | ||
"from wiktionary_de_parser.parser.parse_meanings import format_meaning_dict\n", | ||
"\n", | ||
"\n", | ||
"def calculate_depth(list_item: MeaningDict):\n", | ||
" if not list_item.get(\"sublist\"):\n", | ||
" return 0\n", | ||
" return 1 + max(\n", | ||
" [calculate_depth(subitem) for subitem in list_item.get(\"sublist\", [])]\n", | ||
" )\n", | ||
"\n", | ||
"\n", | ||
"# Calculate the number of maximum list depth in all lists in all_lists\n", | ||
"max_depth = 0\n", | ||
"for word, lists in all_lists.items():\n", | ||
" for wiki_list_item in lists:\n", | ||
" depth = calculate_depth(wiki_list_item)\n", | ||
" if depth > max_depth:\n", | ||
" max_depth = depth\n", | ||
"\n", | ||
" if depth == 3:\n", | ||
" print(f\"word: {word}\")\n", | ||
" print(format_meaning_dict(wiki_list_item))\n", | ||
" print(\"\\n\")\n", | ||
"\n", | ||
"print(max_depth)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.