diff --git a/src/discord/commands/pdfs.ts b/src/discord/commands/pdfs.ts index 79cd3097..d04e327f 100644 --- a/src/discord/commands/pdfs.ts +++ b/src/discord/commands/pdfs.ts @@ -54,10 +54,20 @@ export default { }) thread.send(`PDF i kö: ${url}`) - nlmParsePDF.queue.add('download ' + url.slice(-20), { - url, - threadId: thread.id, - }) + nlmParsePDF.queue.add( + 'download ' + url.slice(-20), + { + url, + threadId: thread.id, + }, + { + backoff: { + type: 'fixed', + delay: 60_000, + }, + attempts: 10, + } + ) }) } catch (error) { console.error('Pdfs: error', error) diff --git a/src/lib/vectordb.ts b/src/lib/vectordb.ts index de452eb8..aff4b494 100644 --- a/src/lib/vectordb.ts +++ b/src/lib/vectordb.ts @@ -23,9 +23,29 @@ async function addReport(url: string, markdown: string) { .map((p) => p.trim()) .filter((p) => p.length > 0) + let prefix = '' + const mergedParagraphs: string[] = [] + + // Combine standalone headers (titles without body) with the next paragraph that has a body. + for (let i = 0; i < paragraphs.length; i++) { + const current = paragraphs[i] + const hasBody = current.split('\n').length > 1 + + if (!hasBody) { + prefix += (prefix ? '\n' : '') + current + } else { + mergedParagraphs.push((prefix ? prefix + '\n' : '') + current) + prefix = '' + } + } + + if (prefix) { + mergedParagraphs.push(prefix) + } + const documentChunks: { chunk: string; paragraph: string }[] = [] - paragraphs.forEach((paragraph) => { + mergedParagraphs.forEach((paragraph) => { for (let i = 0; i < paragraph.length; i += CHUNK_SIZE - overlapSize) { const chunk = paragraph.slice(i, i + CHUNK_SIZE).trim() if (chunk.length > 0) { diff --git a/src/prompts/followUp/scope12.ts b/src/prompts/followUp/scope12.ts index 4795bdcf..02c02163 100644 --- a/src/prompts/followUp/scope12.ts +++ b/src/prompts/followUp/scope12.ts @@ -43,7 +43,7 @@ NEVER CALCULATE ANY EMISSIONS. ONLY REPORT THE DATA AS IT IS IN THE PDF. If you Example - feel free to add more fields and relevant data: { "scope12": [{ - "year": 2021, + "year": 2023, "scope1": { "total": 12.3 }, diff --git a/src/workers/nlmParsePDF.ts b/src/workers/nlmParsePDF.ts index ceb0ae18..a174d3ab 100644 --- a/src/workers/nlmParsePDF.ts +++ b/src/workers/nlmParsePDF.ts @@ -44,6 +44,17 @@ const nlmParsePDF = new DiscordWorker( let json try { json = await extractJsonFromPdf(pdf) + } catch (err) { + if (job.attemptsMade < (job.opts?.attempts || 10)) { + job.editMessage( + `❌ Fel vid tolkning av PDF: ${err.message}. Försöker igen om en stund...` + ) + } else { + job.editMessage( + `❌ Fel vid tolkning av PDF: ${err.message}. Ger upp...` + ) + } + throw new Error('Failed to parse PDF, retrying in one minute...') } finally { clearInterval(interval) }