Skip to content

Commit

Permalink
fix: improve citation logic (#578) bump:patch
Browse files Browse the repository at this point in the history
  • Loading branch information
taprosoft authored Dec 18, 2024
1 parent 3bd19f3 commit 4fe0807
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 26 deletions.
15 changes: 12 additions & 3 deletions libs/kotaemon/kotaemon/indices/qa/citation_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,19 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
highlight_text = ""

ss = sorted(ss, key=lambda x: x["start"])
last_end = 0
text = cur_doc.text[: ss[0]["start"]]

for idx, span in enumerate(ss):
to_highlight = cur_doc.text[span["start"] : span["end"]]
if len(to_highlight) > len(highlight_text):
highlight_text = to_highlight
# clamp to the end of the previous span so highlighted ranges never overlap
span_start = max(last_end, span["start"])
span_end = max(last_end, span["end"])

to_highlight = cur_doc.text[span_start:span_end]
last_end = span_end

# append to highlight on PDF viewer
highlight_text += (" " if highlight_text else "") + to_highlight

span_idx = span.get("idx", None)
if span_idx is not None:
Expand All @@ -350,6 +358,7 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
)
if idx < len(ss) - 1:
text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]

text += cur_doc.text[ss[-1]["end"] :]
# add to display list
with_citation.append(
Expand Down
8 changes: 6 additions & 2 deletions libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def answer_to_citations(self, answer) -> list[InlineEvidence]:
def replace_citation_with_link(self, answer: str):
# Define the regex pattern to match 【number】
pattern = r"【\d+】"
alternate_pattern = r"\[\d+\]"

# Regular expression to match merged citations
multi_pattern = r"【([\d,\s]+)】"
Expand All @@ -166,19 +167,22 @@ def split_citations(match):
answer = re.sub(multi_pattern, split_citations, answer)

# Find all citations in the answer
matches = re.finditer(pattern, answer)
matches = list(re.finditer(pattern, answer))
if not matches:
matches = list(re.finditer(alternate_pattern, answer))

matched_citations = set()
for match in matches:
citation = match.group()
matched_citations.add(citation)

for citation in matched_citations:
citation_id = citation[1:-1]
answer = answer.replace(
citation,
(
"<a href='#' class='citation' "
f"id='mark-{citation[1:-1]}'>{citation}</a>"
f"id='mark-{citation_id}'>{citation_id}</a>"
),
)

Expand Down
35 changes: 28 additions & 7 deletions libs/kotaemon/kotaemon/indices/qa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,38 @@ def find_text(search_span, context, min_length=5):
sentence_list = search_span.split("\n")
context = context.replace("\n", " ")

matches = []
matches_span = []
# don't search for small text
if len(search_span) > min_length:
for sentence in sentence_list:
match = SequenceMatcher(
None, sentence, context, autojunk=False
).find_longest_match()
if match.size > max(len(sentence) * 0.35, min_length):
matches.append((match.b, match.b + match.size))
match_results = SequenceMatcher(
None,
sentence,
context,
autojunk=False,
).get_matching_blocks()

matched_blocks = []
for _, start, length in match_results:
if length > max(len(sentence) * 0.2, min_length):
matched_blocks.append((start, start + length))

if matched_blocks:
start_index = min(start for start, _ in matched_blocks)
end_index = max(end for _, end in matched_blocks)
length = end_index - start_index

if length > max(len(sentence) * 0.35, min_length):
matches_span.append((start_index, end_index))

if matches_span:
# merge all matches into one span
final_span = min(start for start, _ in matches_span), max(
end for _, end in matches_span
)
matches_span = [final_span]

return matches
return matches_span


def find_start_end_phrase(
Expand Down
10 changes: 4 additions & 6 deletions libs/ktem/ktem/assets/css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ span.icon {
}

pdfjs-viewer-element {
height: 100vh;
height: 100dvh;
}

Expand All @@ -290,9 +289,8 @@ pdfjs-viewer-element {
left: 0;
top: 0;
width: 100%;
height: 100%;
overflow: auto;
background-color: rgb(0, 0, 0);
height: 85dvh;
overflow: hidden;
background-color: rgba(0, 0, 0, 0.4);
}

Expand All @@ -302,7 +300,7 @@ pdfjs-viewer-element {

.modal-content {
background-color: #fefefe;
height: 110%;
height: 100%;
display: flex;
flex-direction: column;
}
Expand All @@ -323,7 +321,7 @@ pdfjs-viewer-element {

.modal-body {
flex: 1;
overflow: auto;
overflow: hidden;
}

/* Switch checkbox styles */
Expand Down
21 changes: 17 additions & 4 deletions libs/ktem/ktem/assets/js/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ function run() {
globalThis.toggleChatColumn = (() => {
/* get flex-grow value of chat_column */
let flex_grow = conv_column.style.flexGrow;
console.log("chat col", flex_grow);
if (flex_grow == '0') {
conv_column.style.flexGrow = '1';
conv_column.style.minWidth = default_conv_column_min_width;
Expand Down Expand Up @@ -95,10 +94,24 @@ function run() {
event.preventDefault(); // Prevent the default link behavior
var citationId = event.target.getAttribute('id');

await sleep(100); // Sleep for 500 milliseconds
await sleep(100); // Sleep for 100 milliseconds

// check if modal is open
var modal = document.getElementById("pdf-modal");
var citation = document.querySelector('mark[id="' + citationId + '"]');
if (citation) {
citation.scrollIntoView({ behavior: 'smooth' });

if (modal.style.display == "block") {
// trigger on click event of PDF Preview link
var detail_elem = citation;
// traverse up the DOM tree to find the enclosing <details> element
while (detail_elem.tagName.toLowerCase() != "details") {
detail_elem = detail_elem.parentElement;
}
detail_elem.getElementsByClassName("pdf-link").item(0).click();
} else {
if (citation) {
citation.scrollIntoView({ behavior: 'smooth' });
}
}
}
}
47 changes: 43 additions & 4 deletions libs/ktem/ktem/assets/js/pdf_viewer.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,52 @@ function onBlockLoad () {
modal.style.position = "fixed";
modal.style.width = "70%";
modal.style.left = "15%";
modal.style.height = "100dvh";
} else {
modal.style.position = old_position;
modal.style.width = old_width;
modal.style.left = old_left;
modal.style.height = "85dvh";
}
};
}

/**
 * Highlight the text-layer spans of one PDF page that match a search phrase.
 *
 * A span counts as a match when its text is longer than 4 characters and
 * either contains the phrase or is contained in it. Matching spans get their
 * text wrapped in `<span class="highlight selected">`; every other span that
 * still carries a highlight from an earlier call is reset to plain text.
 *
 * @param {string} search_phrase Text to look for. An empty or missing phrase
 *   matches nothing, so the call only clears existing highlights.
 * @param {string|number} page_label Value of the page's `data-page-number`
 *   attribute inside the viewer.
 */
globalThis.compareText = (search_phrase, page_label) => {
  // The viewer element or its iframe may not exist (yet); do nothing rather
  // than throw inside a click handler.
  var viewer = document.querySelector("#pdf-viewer");
  var iframe = viewer ? viewer.iframe : null;
  if (!iframe) {
    return;
  }
  var innerDoc = iframe.contentDocument
    ? iframe.contentDocument
    : (iframe.contentWindow ? iframe.contentWindow.document : null);
  if (!innerDoc) {
    return;
  }

  // Every string includes "", so an empty phrase would otherwise highlight
  // every span on the page; a null phrase would throw on `.includes`.
  var phrase = search_phrase ? String(search_phrase) : "";

  var query_selector = (
    "#viewer > div[data-page-number='" +
    page_label +
    "'] > div.textLayer > span"
  );
  var page_spans = innerDoc.querySelectorAll(query_selector);
  for (var i = 0; i < page_spans.length; i++) {
    var span = page_spans[i];
    var text = span.textContent;
    var is_match = (
      phrase.length > 0 &&
      text.length > 4 &&
      (phrase.includes(text) || text.includes(phrase))
    );

    if (is_match) {
      // Build the wrapper as a DOM node and assign the text via textContent:
      // PDF text containing "<" or "&" must not be parsed as markup.
      var mark = innerDoc.createElement("span");
      mark.className = "highlight selected";
      mark.textContent = text;
      span.textContent = "";
      span.appendChild(mark);
    } else if (span.querySelector(".highlight")) {
      // Span was highlighted by a previous call: drop the wrapper and keep
      // the plain text (again without going through the HTML parser).
      span.textContent = text;
    }
  }
}

/**
 * Pause helper for async code: returns a Promise that resolves (with no
 * value) once a timer of `ms` milliseconds has fired.
 */
function sleep(ms) {
  return new Promise(function (done) {
    setTimeout(done, ms);
  });
}

// Function to open modal and display PDF
globalThis.openModal = (event) => {
globalThis.openModal = async (event) => {
event.preventDefault();
var target = event.currentTarget;
var src = target.getAttribute("data-src");
Expand All @@ -66,8 +102,8 @@ function onBlockLoad () {
if (current_src != src) {
pdfViewer.setAttribute("src", src);
}
pdfViewer.setAttribute("phrase", phrase);
pdfViewer.setAttribute("search", search);
// pdfViewer.setAttribute("phrase", phrase);
// pdfViewer.setAttribute("search", search);
pdfViewer.setAttribute("page", page);

var scrollableDiv = document.getElementById("chat-info-panel");
Expand All @@ -80,6 +116,10 @@ function onBlockLoad () {
info_panel.style.display = "none";
}
scrollableDiv.scrollTop = 0;

/* search for text inside PDF page */
await sleep(500);
compareText(search, page);
}

globalThis.assignPdfOnclickEvent = () => {
Expand All @@ -93,7 +133,6 @@ function onBlockLoad () {
var created_modal = document.getElementById("pdf-viewer");
if (!created_modal) {
createModal();
console.log("Created modal")
}

}

0 comments on commit 4fe0807

Please sign in to comment.