Separate api calls for secondary summaries (#2366)
* Separate prompt generation for sector and component

* Format prompt and create migration file
susilnem authored and sudip-khanal committed Jan 6, 2025
1 parent 7b78e6f commit 758eeb9
Showing 4 changed files with 115 additions and 46 deletions.
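
At a glance, the change splits the single secondary prompt into separate sector and component prompts, each cached under its own PromptType. A minimal sketch of the new call flow, using only names that appear in the diff below (the DataFrame and filter_data are assumed to come from the existing prioritization step in per/task.py):

    # format_secondary_prompt now returns two prompts instead of one, and
    # get_or_create_secondary_summary caches each of them separately.
    sector_prompt, component_prompt = OpsLearningSummaryTask.format_secondary_prompt(
        secondary_learning_df=secondary_learning_df, filter_data=filter_data
    )
    OpsLearningSummaryTask.get_or_create_secondary_summary(
        ops_learning_summary_instance=ops_learning_summary_instance,
        sector_learning_prompt=sector_prompt,
        component_learning_prompt=component_prompt,
        overwrite_prompt_cache=False,
    )
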
20 changes: 20 additions & 0 deletions per/migrations/0124_alter_opslearningpromptresponsecache_type.py
@@ -0,0 +1,20 @@
# Generated by Django 4.2.17 on 2025-01-03 03:07

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("per", "0123_alter_perdocumentupload_file_alter_perfile_file"),
]

operations = [
migrations.AlterField(
model_name="opslearningpromptresponsecache",
name="type",
field=models.IntegerField(
choices=[(1, "Primary"), (2, "Secondary"), (3, "Sector"), (4, "Component")], verbose_name="type"
),
),
]
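
The altered choices only take effect once this migration has been applied; a typical invocation (assuming a standard Django manage.py setup for this project) is:

    python manage.py migrate per 0124_alter_opslearningpromptresponsecache_type
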
2 changes: 2 additions & 0 deletions per/models.py
@@ -761,6 +761,8 @@ class OpsLearningPromptResponseCache(models.Model):
class PromptType(models.IntegerChoices):
PRIMARY = 1, _("Primary")
SECONDARY = 2, _("Secondary")
SECTOR = 3, _("Sector")
COMPONENT = 4, _("Component")

prompt_hash = models.CharField(verbose_name=_("used prompt hash"), max_length=32)
prompt = models.TextField(verbose_name=_("used prompt"), null=True, blank=True)
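
The two new members are what the summary task uses to key its prompt/response cache (see per/ops_learning_summary.py below). For illustration only, a hypothetical ORM lookup of the cached component responses would read:

    # Hypothetical query; model and field names are taken from the diff above.
    OpsLearningPromptResponseCache.objects.filter(
        type=OpsLearningPromptResponseCache.PromptType.COMPONENT
    )
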
130 changes: 86 additions & 44 deletions per/ops_learning_summary.py
@@ -57,7 +57,7 @@ class OpsLearningSummaryTask:
MIN_DIF_EXCERPTS = 3

primary_prompt = (
"Please aggregate and summarize the provided data into UP TO THREE structured paragraphs.\n"
"\n Please aggregate and summarize the provided data into UP TO THREE structured paragraphs.\n"
"The output MUST strictly adhere to the format below:\n"
"- *Title*: Each finding should begin with the main finding TITLE in bold.\n"
"Should be a high level summary of the finding below. "
@@ -87,11 +87,32 @@ class OpsLearningSummaryTask:
'"contradictory reports": "..."}'
)

secondary_prompt = (
"Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
component_prompt = (
"\n Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
"The output SHOULD ALWAYS follow the format below:\n"
"- *Type*: Whether the paragraph is related to a 'sector' or a 'component'\n"
"- *Subtype*: Provides the name of the sector or of the component to which the paragraph refers.\n"
"- *Type*: 'component'\n"
"- *Subtype*: Provides the name of the component to which the paragraph refers.\n"
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
"*Content*: A short summary aggregating findings related to the Subtype, "
"so that they are supported by evidence coming from more than one report, "
"and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports "
"it from the data available from multiples reports or items, include year and country of the evidence. "
"The length of each paragraph MUST be between 20 and 30 words.\n"
" Important:\n\n"
"- ONLY create one summary per subtype\n"
"- DO NOT mention the ids of the excerpts in the content of the summary.\n"
"- DO NOT use data from any source other than the one provided.\n\n"
"Output Format:\n"
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY.\n"
'{"0": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, '
'"1": {"type": "component", "subtype": "Logistics", "excerpts id":"45, 678", "content": "lorem ipsum"}}'
)

sector_prompt = (
"\n Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
"The output SHOULD ALWAYS follow the format below:\n"
"- *Type*: 'sector'\n"
"- *Subtype*: Provides the name of the sector to which the paragraph refers.\n"
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
"*Content*: A short summary aggregating findings related to the Subtype, "
"so that they are supported by evidence coming from more than one report, "
@@ -105,8 +126,7 @@ class OpsLearningSummaryTask:
"Output Format:\n"
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY.\n"
'{"0": {"type": "sector", "subtype": "shelter", "excerpts id":"43, 1375, 14543", "content": "lorem ipsum"}, '
'"1": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, '
'"2": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}'
'"1": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}'
)

system_message = (
@@ -686,44 +706,61 @@ def process_learnings_component(component, df, max_length_per_section):
)
return learnings_component

def _build_data_section(secondary_df: pd.DataFrame):
# Secondary learnings section
sectors = get_main_sectors(secondary_df)
def _build_component_data_section(secondary_df: pd.DataFrame):
# Component learnings section
components = get_main_components(secondary_df)
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT

if (len(sectors) + len(components)) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors))
if len(components) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / len(components)

learnings_sectors = (
learnings_components = (
"\n----------------\n\n"
+ "TYPE: SECTORS"
+ "TYPE: COMPONENT"
+ "\n----------------\n".join(
[process_learnings_sector(x, secondary_df, max_length_per_section) for x in sectors if pd.notna(x)]
[process_learnings_component(x, secondary_df, max_length_per_section) for x in components if pd.notna(x)]
)
)
learnings_components = (
secondary_learnings_data = learnings_components
return secondary_learnings_data

def _build_sector_data_section(secondary_df: pd.DataFrame):
# Sector learnings section
sectors = get_main_sectors(secondary_df)
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT

if len(sectors) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / len(sectors)

learnings_sectors = (
"\n----------------\n\n"
+ "TYPE: COMPONENT"
+ "TYPE: SECTORS"
+ "\n----------------\n".join(
[process_learnings_component(x, secondary_df, max_length_per_section) for x in components if pd.notna(x)]
[process_learnings_sector(x, secondary_df, max_length_per_section) for x in sectors if pd.notna(x)]
)
)
secondary_learnings_data = learnings_sectors + learnings_components
secondary_learnings_data = learnings_sectors
return secondary_learnings_data

# Prompt intro section
prompt_intro = cls._build_intro_section()
secondary_prompt_instruction = cls._build_instruction_section(
filter_data, secondary_learning_df, cls.secondary_instruction_prompt
)
secondary_learnings_data = _build_data_section(secondary_learning_df)

# Sector Prompt and Data
sector_prompt_instruction = cls._build_instruction_section(filter_data, secondary_learning_df, cls.sector_prompt)
sector_learning_data = _build_sector_data_section(secondary_learning_df)

# Components Prompt and Data
component_prompt_instruction = cls._build_instruction_section(filter_data, secondary_learning_df, cls.component_prompt)
component_learning_data = _build_component_data_section(secondary_learning_df)

# format the prompts
secondary_learning_prompt = "".join(
[prompt_intro, secondary_prompt_instruction, secondary_learnings_data, cls.secondary_prompt]
sector_learning_prompt = "".join([prompt_intro, sector_prompt_instruction, sector_learning_data, cls.sector_prompt])
component_learning_prompt = "".join(
[prompt_intro, component_prompt_instruction, component_learning_data, cls.component_prompt]
)

logger.info("Secondary Prompt formatted.")
return secondary_learning_prompt
return sector_learning_prompt, component_learning_prompt

@classmethod
def generate_summary(cls, prompt, type: OpsLearningPromptResponseCache.PromptType) -> dict:
@@ -849,8 +886,10 @@ def _modify_summary(summary: dict) -> dict:

@classmethod
def _get_or_create_summary(
cls, prompt: str, prompt_hash: str, type: OpsLearningPromptResponseCache.PromptType, overwrite_prompt_cache: bool = False
cls, prompt: str, type: OpsLearningPromptResponseCache.PromptType, overwrite_prompt_cache: bool = False
) -> dict:
"""Retrieves or Generates the summary based on the provided prompt."""
prompt_hash = OpslearningSummaryCacheHelper.generate_hash(prompt)
instance, created = OpsLearningPromptResponseCache.objects.update_or_create(
prompt_hash=prompt_hash,
type=type,
@@ -952,13 +991,9 @@ def get_or_create_primary_summary(
"""Retrieves or Generates the primary summary based on the provided prompt."""
logger.info("Retrieving or generating primary summary.")

# generating hash for primary prompt
primary_prompt_hash = OpslearningSummaryCacheHelper.generate_hash(primary_learning_prompt)

# Checking the response for primary prompt
primary_summary = cls._get_or_create_summary(
prompt=primary_learning_prompt,
prompt_hash=primary_prompt_hash,
type=OpsLearningPromptResponseCache.PromptType.PRIMARY,
overwrite_prompt_cache=overwrite_prompt_cache,
)
@@ -981,30 +1016,37 @@
def get_or_create_secondary_summary(
cls,
ops_learning_summary_instance: OpsLearningCacheResponse,
secondary_learning_prompt: str,
sector_learning_prompt: str,
component_learning_prompt: str,
overwrite_prompt_cache: bool = False,
):
"""Retrieves or Generates the summary based on the provided prompts."""
logger.info("Retrieving or generating secondary summary.")

# generating hash for secondary prompt
secondary_prompt_hash = OpslearningSummaryCacheHelper.generate_hash(secondary_learning_prompt)

# Checking the response for secondary prompt
secondary_summary = cls._get_or_create_summary(
prompt=secondary_learning_prompt,
prompt_hash=secondary_prompt_hash,
type=OpsLearningPromptResponseCache.PromptType.SECONDARY,
overwrite_prompt_cache=overwrite_prompt_cache,
)
if overwrite_prompt_cache:
logger.info("Clearing the cache for secondary summary.")
# NOTE: find a better way to update the cache
OpsLearningComponentCacheResponse.objects.filter(filter_response=ops_learning_summary_instance).delete()
OpsLearningSectorCacheResponse.objects.filter(filter_response=ops_learning_summary_instance).delete()

# Saving into the database
# Checking the response for sector prompt
sector_summary = cls._get_or_create_summary(
prompt=sector_learning_prompt,
type=OpsLearningPromptResponseCache.PromptType.SECTOR,
overwrite_prompt_cache=overwrite_prompt_cache,
)
cls.secondary_response_save_to_db(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_summary=sector_summary,
)

# Checking the response for component prompt
component_summary = cls._get_or_create_summary(
prompt=component_learning_prompt,
type=OpsLearningPromptResponseCache.PromptType.COMPONENT,
overwrite_prompt_cache=overwrite_prompt_cache,
)
cls.secondary_response_save_to_db(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_summary=secondary_summary,
secondary_summary=component_summary,
)
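
Both new prompts ask the model for plain JSON (see the component_prompt and sector_prompt literals above), so each response can be parsed straight into a dict before it is saved via secondary_response_save_to_db. A minimal sketch, assuming the model replies with exactly the shape requested:

    import json

    # Hypothetical raw reply in the format requested by sector_prompt.
    raw = '{"0": {"type": "sector", "subtype": "WASH", "excerpts id": "30, 40", "content": "lorem ipsum"}}'
    summary = json.loads(raw)
    print(summary["0"]["subtype"])  # -> WASH
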
9 changes: 7 additions & 2 deletions per/task.py
@@ -49,12 +49,17 @@ def generate_ops_learning_summary(ops_learning_summary_id: int, filter_data: dic

# Prioritize excerpts for secondary insights
secondary_learning_df = OpsLearningSummaryTask.seconday_prioritize_excerpts(prioritized_learnings)

# Format secondary prompt
secondary_learning_prompt = OpsLearningSummaryTask.format_secondary_prompt(secondary_learning_df, filter_data)
sector_learning_prompt, component_learning_prompt = OpsLearningSummaryTask.format_secondary_prompt(
secondary_learning_df=secondary_learning_df, filter_data=filter_data
)

# Generate secondary summary
OpsLearningSummaryTask.get_or_create_secondary_summary(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_learning_prompt=secondary_learning_prompt,
sector_learning_prompt=sector_learning_prompt,
component_learning_prompt=component_learning_prompt,
overwrite_prompt_cache=overwrite_prompt_cache,
)
