Skip to content

Commit

Permalink
Post Process Paragraphs to Improve Chunking (#385)
Browse files Browse the repository at this point in the history
* post-process paragraphs to check for single-line paragraphs and merge them

* add clarifying comments
  • Loading branch information
hugo-nl authored Dec 4, 2024
1 parent 420ac61 commit ce0392d
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
22 changes: 21 additions & 1 deletion src/lib/vectordb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,29 @@ async function addReport(url: string, markdown: string) {
.map((p) => p.trim())
.filter((p) => p.length > 0)

let prefix = ''
const mergedParagraphs: string[] = []

// Combine standalone headers (titles without body) with the next paragraph that has a body.
for (let i = 0; i < paragraphs.length; i++) {
const current = paragraphs[i]
const hasBody = current.split('\n').length > 1

if (!hasBody) {
prefix += (prefix ? '\n' : '') + current
} else {
mergedParagraphs.push((prefix ? prefix + '\n' : '') + current)
prefix = ''
}
}

if (prefix) {
mergedParagraphs.push(prefix)
}

const documentChunks: { chunk: string; paragraph: string }[] = []

paragraphs.forEach((paragraph) => {
mergedParagraphs.forEach((paragraph) => {
for (let i = 0; i < paragraph.length; i += CHUNK_SIZE - overlapSize) {
const chunk = paragraph.slice(i, i + CHUNK_SIZE).trim()
if (chunk.length > 0) {
Expand Down
2 changes: 1 addition & 1 deletion src/prompts/followUp/scope12.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ NEVER CALCULATE ANY EMISSIONS. ONLY REPORT THE DATA AS IT IS IN THE PDF. If you
Example - feel free to add more fields and relevant data:
{
"scope12": [{
"year": 2021,
"year": 2023,
"scope1": {
"total": 12.3
},
Expand Down

0 comments on commit ce0392d

Please sign in to comment.