From 84f927c5b04bb8c1e9434d1a49f526adbd800b6d Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 9 Aug 2024 18:16:02 +0200 Subject: [PATCH] Quartz sync: Aug 9, 2024, 6:16 PM --- .gitignore | 1 + ...ution of Order, from Atoms to Economies.md | 5 +++ ...ational Economic Order A Reintroduction.md | 5 +++ ...ology Design Patterns . org (ODP) - Odp.md | 5 +++ content/Scripts/file_to_text.py | 19 ++++++++ content/Scripts/pdf_to_txt.py | 17 ------- content/Scripts/prompts.py | 35 +++++++++++++-- content/Scripts/text_to_prolog.py | 27 ++++++------ content/Scripts/utils.py | 44 ++++++++++++++++--- 9 files changed, 117 insertions(+), 41 deletions(-) create mode 100644 content/References/book/Why Information Grows The Evolution of Order, from Atoms to Economies.md create mode 100644 content/References/journalArticle/The New International Economic Order A Reintroduction.md create mode 100644 content/References/webpage/Ontology Design Patterns . org (ODP) - Odp.md create mode 100644 content/Scripts/file_to_text.py delete mode 100644 content/Scripts/pdf_to_txt.py diff --git a/.gitignore b/.gitignore index 1970163..159f6d4 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ replit.nix .vscode content/excalibrain.md content/Scripts/.env +StructuredReferences/ \ No newline at end of file diff --git a/content/References/book/Why Information Grows The Evolution of Order, from Atoms to Economies.md b/content/References/book/Why Information Grows The Evolution of Order, from Atoms to Economies.md new file mode 100644 index 0000000..64b17b2 --- /dev/null +++ b/content/References/book/Why Information Grows The Evolution of Order, from Atoms to Economies.md @@ -0,0 +1,5 @@ +[🇿](zotero://select/library/items/LUXMDBMB) + +[[Entries/Individuals/Cesar Hidalgo]] +# Why Information Grows: The Evolution of Order, from Atoms to Economies (2015) + diff --git a/content/References/journalArticle/The New International Economic Order A Reintroduction.md b/content/References/journalArticle/The New 
International Economic Order A Reintroduction.md new file mode 100644 index 0000000..0f6acc5 --- /dev/null +++ b/content/References/journalArticle/The New International Economic Order A Reintroduction.md @@ -0,0 +1,5 @@ +[🇿](zotero://select/library/items/M4HCQDKN) + +[[Entries/Individuals/Nils Gilman]] +# The New International Economic Order: A Reintroduction (NaN) + diff --git a/content/References/webpage/Ontology Design Patterns . org (ODP) - Odp.md b/content/References/webpage/Ontology Design Patterns . org (ODP) - Odp.md new file mode 100644 index 0000000..9a9d5d1 --- /dev/null +++ b/content/References/webpage/Ontology Design Patterns . org (ODP) - Odp.md @@ -0,0 +1,5 @@ +[🇿](zotero://select/library/items/DXEZ96HF) + + +# Ontology Design Patterns . org (ODP) - Odp + diff --git a/content/Scripts/file_to_text.py b/content/Scripts/file_to_text.py new file mode 100644 index 0000000..093ced0 --- /dev/null +++ b/content/Scripts/file_to_text.py @@ -0,0 +1,19 @@ +import sys +import pylibmagic +from unstructured.partition.auto import partition + +def file_to_text(file_path, txt_path): + elements = partition(filename=file_path) + with open(txt_path, 'w') as f: + for el in elements: + f.write(str(el)) + f.write("\n") + +if len(sys.argv) < 3: + print("Usage: python file_to_text.py input_file output_txt") + sys.exit(1) + +file_path = sys.argv[1] +txt_path = sys.argv[2] + +file_to_text(file_path, txt_path) diff --git a/content/Scripts/pdf_to_txt.py b/content/Scripts/pdf_to_txt.py deleted file mode 100644 index c33633b..0000000 --- a/content/Scripts/pdf_to_txt.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys -from pypdf import PdfReader - -def pdf_to_txt(pdf_path, txt_path): - with open(pdf_path, 'rb') as pdf_file, open(txt_path, 'w', encoding='utf-8') as txt_file: - pdf_reader = PdfReader(pdf_file) - for page in pdf_reader.pages: - txt_file.write(page.extract_text()) - -if len(sys.argv) < 3: - print("Usage: python pdf_to_txt.py input_pdf output_txt") - sys.exit(1) - -pdf_path = 
sys.argv[1] -txt_path = sys.argv[2] - -pdf_to_txt(pdf_path, txt_path) diff --git a/content/Scripts/prompts.py b/content/Scripts/prompts.py index 413ea6e..e2f3045 100644 --- a/content/Scripts/prompts.py +++ b/content/Scripts/prompts.py @@ -1,9 +1,36 @@ -def correctness_check_prompt(domain_subjects): - return f'''You are a domain expert in the field of {domain_subjects}. - Check the Prolog code for correctness based on the text. Ensure all relationships are logically sound and perfectly consistent with the text. +def correctness_check_prompt(): + # optionally add You are a domain expert in the field of {domain_subjects}. + return f'''Check the Prolog code for correctness and completeness based on the text. Ensure all relationships are logically sound and perfectly consistent with the text. If you find any inconsistencies, correct them in the Prolog code. + If anything is missing, add missing predicates. + + + Please ONLY use the following predicates from the GFO ontology: - Please respond with prolog code only.''' + Classes: + Abstract Action Amount of substrate Awareness level Biological level Category Change Chemical level Chronoid Concept Concrete Configuration Configuroid Continuous Continuous change Continuous process Dependent Discrete Discrete presential Discrete process Entity Extrinsic change Function History Independent Individual Instantanuous change Intrinsic change Item Level Line Mass entity Material boundary Material line Material object Material persistant Material point Material stratum Material structure Material surface Mental stratum Occurrent Ontological layer Persistant Personality level Physical level Point Presential Process Processual role Property Property value Relational role Relator Role Set Situation Situoid Social role Social stratum Space Space time Spatial boundary Spatial region State Stratum Surface Symbol Symbol sequence Symbol structure Temporal region Time Time boundary Token Topoid Universal Value space + + Object Properties: + 
abstract has part abstract part of agent in boundary of categorial part of category in layer caused by causes constituent part of depends on exists at framed by frames function determinant of functional item of goal of has boundary has categorial part has category has constituent part has function has function determinant has functional item has goal has left time boundary has member has part has participant has proper part has requirement has right time boundary has sequence constituent has spatial boundary has time boundary has token has value instance of instantiated by layer of left boundary of level of member of necessary for occupied by occupies on layer on level on stratum part of participates in plays role projection of projects to proper part of realized by realizes requirement of right boundary of role of sequence constituent of spatial boundary of stratum of time boundary of value of + + Please respond with prolog code only. + ''' + +arity_two_prompt = '''You are an expert at creating Knowledge Graphs in Prolog. +Translate sentences in the text into Prolog code using predicates of arity 2. +Arity 2 predicates define relationships (verbs) between nouns, they are provided below. 
+ +You can ONLY use the following predicates: + +Classes: + Abstract Action Amount of substrate Awareness level Biological level Category Change Chemical level Chronoid Concept Concrete Configuration Configuroid Continuous Continuous change Continuous process Dependent Discrete Discrete presential Discrete process Entity Extrinsic change Function History Independent Individual Instantanuous change Intrinsic change Item Level Line Mass entity Material boundary Material line Material object Material persistant Material point Material stratum Material structure Material surface Mental stratum Occurrent Ontological layer Persistant Personality level Physical level Point Presential Process Processual role Property Property value Relational role Relator Role Set Situation Situoid Social role Social stratum Space Space time Spatial boundary Spatial region State Stratum Surface Symbol Symbol sequence Symbol structure Temporal region Time Time boundary Token Topoid Universal Value space + +Object Properties: + abstract has part abstract part of agent in boundary of categorial part of category in layer caused by causes constituent part of depends on exists at framed by frames function determinant of functional item of goal of has boundary has categorial part has category has constituent part has function has function determinant has functional item has goal has left time boundary has member has part has participant has proper part has requirement has right time boundary has sequence constituent has spatial boundary has time boundary has token has value instance of instantiated by layer of left boundary of level of member of necessary for occupied by occupies on layer on level on stratum part of participates in plays role projection of projects to proper part of realized by realizes requirement of right boundary of role of sequence constituent of spatial boundary of stratum of time boundary of value of + +Please respond with prolog code only. 
+Text: +''' relation_prompt = '''You are an expert at creating Knowledge Graphs in Prolog. Translate sentences in the text into Prolog code using predicates of arity 2. diff --git a/content/Scripts/text_to_prolog.py b/content/Scripts/text_to_prolog.py index 9d70e7f..d4eb9fc 100644 --- a/content/Scripts/text_to_prolog.py +++ b/content/Scripts/text_to_prolog.py @@ -1,29 +1,30 @@ import sys from utils import parse_prolog_predicates, call_gpt_api, relation_correctness_check, file_to_chunks, text_to_relations, prolog_predicates_to_entities, entities_to_categorized_entities -from prompts import relation_prompt, categories_prompt, correctness_check_prompt +from prompts import relation_prompt, categories_prompt, correctness_check_prompt, arity_two_prompt if len(sys.argv) < 3: - print("Usage: python text_to_prolog.py input_file domain_subjects") + print("Usage: python text_to_prolog.py input_file output_file") sys.exit(1) input_file = sys.argv[1] # e.g. 'nieo.txt' -domain_subjects = sys.argv[2] # e.g. 'NIEO, international relations, economics' -name = input_file.split('.')[0] +output_file = sys.argv[2] # e.g. 
'NIEO, international relations, economics' +output_file_noext = output_file.split('.')[0] original_text_chunk_size = 2000 original_text_chunks = file_to_chunks(input_file, original_text_chunk_size) -relation_output_file = f'{name}_relations.pl' -entities_output_file = f'{name}_entities.pl' -categories_output_file = f'{name}_categories.pl' +relation_output_file = f'{output_file_noext}_relations.pl' +entities_output_file = f'{output_file_noext}_entities.pl' +categories_output_file = f'{output_file_noext}_categories.pl' -output_relations = text_to_relations(original_text_chunks, relation_output_file, relation_prompt, correctness_check_prompt(domain_subjects)) +output_relations = text_to_relations(original_text_chunks, relation_output_file, arity_two_prompt, correctness_check_prompt()) -entity_predicates = parse_prolog_predicates(output_relations) +# entity_predicates = parse_prolog_predicates(output_relations) -prolog_predicates_to_entities(entity_predicates, entities_output_file) +# prolog_predicates_to_entities(entity_predicates, entities_output_file) -entities_chunk_size = 2000 -entities_chunks = file_to_chunks(entities_output_file, entities_chunk_size) +# entities_chunk_size = 2000 +# entities_chunks = file_to_chunks(entities_output_file, entities_chunk_size) -entities_to_categorized_entities(entities_chunks, categories_output_file, categories_prompt) \ No newline at end of file +# entities_to_categorized_entities(entities_chunks, categories_output_file, categories_prompt) \ No newline at end of file diff --git a/content/Scripts/utils.py b/content/Scripts/utils.py index 09d5933..8fb80b2 100644 --- a/content/Scripts/utils.py +++ b/content/Scripts/utils.py @@ -3,10 +3,14 @@ from textwrap import wrap from openai import OpenAI from dotenv import load_dotenv +import anthropic load_dotenv() -client = OpenAI( +claude_client = anthropic.Anthropic( + api_key=os.getenv("ANTHROPIC_API_KEY"), +) +gpt_client = OpenAI( api_key=os.getenv("OPENAI_API_KEY"), ) @@ -20,7 +24,7 @@ 
def file_to_chunks(input_file, chunk_size): def call_gpt_api(chunk, prompt): """Call the GPT API with the given chunk and prompt.""" try: - response = client.chat.completions.create( + response = gpt_client.chat.completions.create( messages=[ { "role": "system", @@ -37,6 +41,30 @@ def call_gpt_api(chunk, prompt): except Exception as e: return f"Error: {str(e)}" +def call_claude_api(chunk, prompt): + try: + message = claude_client.messages.create( + system=prompt, + model="claude-3-5-sonnet-20240620", + max_tokens=len(chunk), + temperature=0, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": chunk + } + ] + } + ] + ) + return message.content[0].text + except Exception as e: + return f"Error: {str(e)}" + + def relation_correctness_check(text, prolog_code, prompt): correctness_check_prompt = f'''{prompt} @@ -45,7 +73,8 @@ def relation_correctness_check(text, prolog_code, prompt): Original Text:''' - return call_gpt_api(text, correctness_check_prompt) + # return call_gpt_api(text, correctness_check_prompt) + return call_claude_api(text, correctness_check_prompt) def text_to_relations(chunks, output_file, prompt, correctness_check_prompt): """Extract relations from a list of chunks and save to output file.""" @@ -53,10 +82,11 @@ def text_to_relations(chunks, output_file, prompt, correctness_check_prompt): with open(output_file, 'w') as f: for i, chunk in enumerate(chunks): print(f"Processing relations chunk {i+1}/{len(chunks)}") - preliminary_result = call_gpt_api(chunk, prompt) - result = relation_correctness_check(chunk, preliminary_result, correctness_check_prompt) - f.write(f"% Chunk {i+1}\n{result}\n\n") - complete_output += result + "\n" + # preliminary_result = call_gpt_api(chunk, prompt) + preliminary_result = call_claude_api(chunk, prompt) + # result = relation_correctness_check(chunk, preliminary_result, correctness_check_prompt) + f.write(f"% Chunk {i+1}\n{preliminary_result}\n\n") + complete_output += preliminary_result + "\n" 
return complete_output