Skip to content

Commit

Permalink
Merge pull request #372 from Klimatbyran/staging
Browse files: browse the repository at this point in the history
Reduce memory usage by serialising pdf-converter
  • Loading branch information
Greenheart authored Dec 2, 2024
2 parents dc3c966 + d127d39 commit 2cdf683
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 34 deletions.
66 changes: 35 additions & 31 deletions src/lib/pdfTools.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -98,50 +98,54 @@ export async function extractTablesFromJson(
json: ParsedDocument,
outputDir: string,
searchTerms: string[]
): Promise<{ pages: { pageIndex: number; filename: string }[] }> {
const pdfConverter = (height: number, width: number) => {
return fromBuffer(pdf, {
density: 600,
format: 'png',
width,
height,
preserveAspectRatio: true,
})
}

): Promise<{ pages: { pageNumber: number; filename: string }[] }> {
const pages = Object.values(
findRelevantTablesGroupedOnPages(json, searchTerms)
)
if (!pages.length) return { pages: [] }

const width = pages[0].pageWidth * 2
const height = pages[0].pageHeight * 2

const pdfConverter = fromBuffer(pdf, {
density: 600,
width,
height,
format: 'png',
preserveAspectRatio: true,
})

const reportId = crypto.randomUUID()
const filenames = await Promise.all(
pages.map(async ({ pageIndex, pageHeight, pageWidth }) => {
const pageNumber = pageIndex + 1
const pageScreenshotPath = path.join(
outputDir,
`${reportId}-page-${pageNumber}.png`
const filenames: { pageNumber: number; filename: string }[] = []

for (const { pageIndex } of pages) {
const pageNumber = pageIndex + 1
const pageScreenshotPath = path.join(
outputDir,
`${reportId}-page-${pageNumber}.png`
)
const result = await pdfConverter(pageNumber, { responseType: 'buffer' })

if (!result.buffer) {
throw new Error(
`Failed to convert pageNumber ${pageNumber} to a buffer\n` +
JSON.stringify(result, null, 2)
)
const convert = pdfConverter(pageHeight * 2, pageWidth * 2)
const result = await convert(pageNumber, { responseType: 'buffer' })
}

if (!result.buffer) {
throw new Error(
`Failed to convert pageNumber ${pageNumber} to a buffer\n` +
JSON.stringify(result, null, 2)
)
}
await writeFile(pageScreenshotPath, result.buffer)

await writeFile(pageScreenshotPath, result.buffer)
return { pageIndex, filename: pageScreenshotPath }
filenames.push({ pageNumber, filename: pageScreenshotPath })

/* Denna fungerar inte än pga boundingbox är fel pga en bugg i NLM ingestor BBOX (se issue här: https://github.com/nlmatics/nlm-ingestor/issues/66).
/* Denna fungerar inte än pga boundingbox är fel pga en bugg i NLM ingestor BBOX (se issue här: https://github.com/nlmatics/nlm-ingestor/issues/66).
När den är fixad kan denna användas istället för att beskära hela sidan. */
/* TODO: fixa boundingbox för tabeller
/* TODO: fixa boundingbox för tabeller
const { x, y, width, height } = calculateBoundingBoxForTable(
table,
pageWidth,
pageHeight
)*/
})
)
}

return { pages: filenames }
}
6 changes: 3 additions & 3 deletions src/workers/nlmExtractTables.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -99,18 +99,18 @@ const nlmExtractTables = new DiscordWorker(
)

const tables: { page_idx: number; markdown: string }[] =
await pages.reduce(async (resultsPromise, { pageIndex, filename }) => {
await pages.reduce(async (resultsPromise, { pageNumber, filename }) => {
const results = await resultsPromise
const lastPageMarkdown = results.at(-1)?.markdown || ''
const markdown = await extractTextViaVisionAPI(
{ filename, name: `Tables from page ${pageIndex}` },
{ filename, name: `Tables from page ${pageNumber}` },
lastPageMarkdown
)
// TODO: Send to s3 bucket (images)
return [
...results,
{
page_idx: Number(pageIndex),
page_idx: Number(pageNumber - 1),
markdown,
},
]
Expand Down

0 comments on commit 2cdf683

Please sign in to comment.