From 9f91507be6661449d91f0bf3d99b775619d9ca00 Mon Sep 17 00:00:00 2001 From: jmal Date: Wed, 31 Jul 2024 18:38:10 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9Epdf=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=B0=81=E9=9D=A2=E9=A2=84=E8=A7=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 70 ++++++-- .../jmal/clouddisk/lucene/LuceneService.java | 15 +- .../clouddisk/lucene/ReadContentService.java | 168 ++++++++++++++++++ .../lucene/ReadPDFContentService.java | 81 --------- .../clouddisk/media/VideoProcessService.java | 2 +- .../jmal/clouddisk/model/FileDocument.java | 5 + .../com/jmal/clouddisk/model/FileIntroVO.java | 4 + .../service/impl/CommonFileService.java | 14 ++ .../jmal/clouddisk/util/FileContentUtil.java | 81 +++++---- .../com/jmal/clouddisk/util/MyFileUtils.java | 2 +- .../com/jmal/clouddisk/EpubCoverTest.java | 94 ++++++++++ .../java/com/jmal/clouddisk/PdfCoverTest.java | 41 +++++ .../com/jmal/clouddisk/WordCoverTest.java | 89 ++++++++++ .../jmal/clouddisk/lucene/ReadPDFTest.java | 6 +- 14 files changed, 524 insertions(+), 148 deletions(-) create mode 100644 src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java delete mode 100644 src/main/java/com/jmal/clouddisk/lucene/ReadPDFContentService.java create mode 100644 src/test/java/com/jmal/clouddisk/EpubCoverTest.java create mode 100644 src/test/java/com/jmal/clouddisk/PdfCoverTest.java create mode 100644 src/test/java/com/jmal/clouddisk/WordCoverTest.java diff --git a/pom.xml b/pom.xml index 4bdc0512..44b8afcb 100644 --- a/pom.xml +++ b/pom.xml @@ -20,8 +20,19 @@ 3.0.0 UTF-8 9.10.0 - + + + + openkm + https://maven.openkm.com/ + + + e-iceblue + https://repo.e-iceblue.cn/repository/maven-public/ + + + @@ -206,37 +217,62 @@ hutool-all 5.8.25 + + org.apache.poi + poi + 5.3.0 + org.apache.poi poi-ooxml - 5.2.2 + 5.3.0 + + + commons-io + commons-io + 2.16.1 - xerces - xercesImpl - 2.12.2 + e-iceblue + spire.doc.free + 5.2.0 org.glassfish.jaxb jaxb-runtime provided - - org.apache.pdfbox - pdfbox - 3.0.2 - - - commons-logging - commons-logging - - - org.apache.tika tika-core - 2.5.0 + 2.9.2 + + + nl.siegmann.epublib + epublib-core + 3.1 + + + org.jsoup + jsoup + 1.18.1 + + + org.xhtmlrenderer + flying-saucer-pdf + 9.9.0 + + + fr.opensagres.xdocreport + fr.opensagres.xdocreport.converter.docx.xwpf + 2.1.0 + + com.itextpdf + html2pdf + 5.0.5 + + com.fasterxml.jackson.dataformat jackson-dataformat-xml diff --git a/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java b/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java index 3434d511..e5510db6 100644 --- a/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java +++ b/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java @@ -65,7 +65,7 @@ public class LuceneService { private final SearcherManager searcherManager; private final IUserService userService; private final TagService tagService; - private final ReadPDFContentService readPDFContentService; + private final ReadContentService readContentService; private final RebuildIndexTaskService rebuildIndexTaskService; public final static String MONGO_INDEX_FIELD = "index"; @@ -248,7 +248,7 @@ private void updateIndex(boolean readContent, FileIntroVO fileIntroVO) { setFileIndex(fileIndex); String content = null; if (readContent) { - content = readFileContent(file); + content = readFileContent(file, fileIntroVO.getId()); if (StrUtil.isBlank(content)) { rebuildIndexTaskService.incrementIndexedTaskSize(); updateIndexStatus(fileIntroVO, IndexStatus.INDEXED); @@ -333,7 +333,7 @@ private void setType(File file, FileIndex fileIndex) { } } - private String readFileContent(File file) { + private String readFileContent(File file, String fileId) { try { if (file == null) { return null; @@ -343,13 +343,16 @@ private String readFileContent(File file) { } String type = FileTypeUtil.getType(file); if ("pdf".equals(type)) { - return readPDFContentService.read(file); + return readContentService.readPdfContent(file, fileId); + } + if ("epub".equals(type)) { + return readContentService.readEpubContent(file); } if ("ppt".equals(type) || "pptx".equals(type)) { - return FileContentUtil.readPPTContent(file); + return readContentService.readPPTContent(file); } if ("doc".equals(type) || "docx".equals(type)) { - return FileContentUtil.readWordContent(file); + return readContentService.readWordContent(file); } if (fileProperties.getSimText().contains(type)) { String charset = UniversalDetector.detectCharset(file); diff --git a/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java new file mode 100644 index 00000000..c6f905cb --- /dev/null +++ b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java @@ -0,0 +1,168 @@ +package com.jmal.clouddisk.lucene; + +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.util.StrUtil; +import com.jmal.clouddisk.media.VideoProcessService; +import com.jmal.clouddisk.ocr.OcrService; +import com.jmal.clouddisk.service.impl.CommonFileService; +import com.jmal.clouddisk.util.FileContentUtil; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import nl.siegmann.epublib.domain.Book; +import nl.siegmann.epublib.domain.Resource; +import nl.siegmann.epublib.domain.Spine; +import nl.siegmann.epublib.epub.EpubReader; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.io.RandomAccessReadBufferedFile; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDResources; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFShape; +import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFTextShape; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.springframework.stereotype.Service; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.List; + +@Service +@RequiredArgsConstructor +@Slf4j +public class ReadContentService { + + private final OcrService ocrService; + + public final CommonFileService commonFileService; + + public final TaskProgressService taskProgressService; + + public final VideoProcessService videoProcessService; + + public String readPdfContent(File file, String fileId) { + try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(file))) { + + String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath())); + + if (StrUtil.isNotBlank(fileId)) { + // 生成pdf封面图像 + File coverFile = FileContentUtil.pdfCoverImage(document, videoProcessService.getVideoCacheDir(username, fileId)); + commonFileService.updateCoverFileDocument(fileId, coverFile); + } + + StringBuilder content = new StringBuilder(); + // 提取每一页的内容 + PDFTextStripper pdfStripper = new PDFTextStripper(); + for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++) { + pdfStripper.setStartPage(pageNumber); + pdfStripper.setEndPage(pageNumber); + String text = pdfStripper.getText(document).trim(); + if (!text.isEmpty()) { + content.append(text); + } else { + taskProgressService.addTaskProgress(file,TaskType.OCR, pageNumber + "/" + document.getNumberOfPages()); + PDPage page = document.getPage(pageNumber - 1); + PDResources resources = page.getResources(); + for (COSName xObjectName : resources.getXObjectNames()) { + PDXObject xObject = resources.getXObject(xObjectName); + if (xObject instanceof PDImageXObject image) { + BufferedImage bufferedImage = image.getImage(); + // 将图像保存到临时文件 + String tempImageFile = ocrService.generateOrcTempImagePath(username); + ImageIO.write(bufferedImage, "png", new File(tempImageFile)); + try { + // 使用 Tesseract 进行 OCR 识别 + String ocrResult = ocrService.doOCR(tempImageFile, ocrService.generateOrcTempImagePath(username)); + content.append(ocrResult); + } finally { + // 删除临时文件 + FileUtil.del(tempImageFile); + } + } + } + } + } + return content.toString(); + } catch (IOException e) { + FileContentUtil.readFailed(file, e); + } finally { + taskProgressService.removeTaskProgress(file); + } + return null; + } + + public String readEpubContent(File file) { + try (InputStream fileInputStream = new FileInputStream(file);) { + // 打开 EPUB 文件 + EpubReader epubReader = new EpubReader(); + Book book = epubReader.readEpub(fileInputStream); + StringBuilder content = new StringBuilder(); + // 获取章节内容 + Spine spine = book.getSpine(); + for (int i = 0; i < spine.size(); i++) { + Resource resource = spine.getResource(i); + InputStream is = resource.getInputStream(); + byte[] bytes = is.readAllBytes(); + String htmlContent = new String(bytes, StandardCharsets.UTF_8); + // 使用 JSoup 解析 HTML 并提取纯文本 + Document document = Jsoup.parse(htmlContent); + String textContent = document.text(); + content.append(textContent); + is.close(); + } + return content.toString(); + } catch (IOException e) { + FileContentUtil.readFailed(file, e); + } + return null; + } + + public String readPPTContent(File file) { + try (FileInputStream fis = new FileInputStream(file.getAbsolutePath()); + XMLSlideShow ppt = new XMLSlideShow(fis)) { + StringBuilder stringBuilder = new StringBuilder(); + for (XSLFSlide slide : ppt.getSlides()) { + for (XSLFShape shape : slide.getShapes()) { + if (shape instanceof XSLFTextShape textShape) { + stringBuilder.append(textShape.getText()); + } + } + } + return stringBuilder.toString(); + } catch (IOException e) { + FileContentUtil.readFailed(file, e); + } + return null; + } + + public String readWordContent(File file) { + try (FileInputStream fis = new FileInputStream(file.getAbsolutePath()); + XWPFDocument document = new XWPFDocument(fis)) { + StringBuilder stringBuilder = new StringBuilder(); + List paragraphs = document.getParagraphs(); + for (XWPFParagraph para : paragraphs) { + stringBuilder.append(para.getText()); + } + return stringBuilder.toString(); + } catch (IOException e) { + FileContentUtil.readFailed(file, e); + } + return null; + } + +} diff --git a/src/main/java/com/jmal/clouddisk/lucene/ReadPDFContentService.java b/src/main/java/com/jmal/clouddisk/lucene/ReadPDFContentService.java deleted file mode 100644 index 92b92b67..00000000 --- a/src/main/java/com/jmal/clouddisk/lucene/ReadPDFContentService.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.jmal.clouddisk.lucene; - -import cn.hutool.core.io.FileUtil; -import com.jmal.clouddisk.ocr.OcrService; -import com.jmal.clouddisk.service.impl.CommonFileService; -import com.jmal.clouddisk.util.FileContentUtil; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.io.RandomAccessReadBufferedFile; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.text.PDFTextStripper; -import org.springframework.stereotype.Service; - -import javax.imageio.ImageIO; -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; - -@Service -@RequiredArgsConstructor -@Slf4j -public class ReadPDFContentService { - - private final OcrService ocrService; - - public final CommonFileService commonFileService; - - public final TaskProgressService taskProgressService; - - public String read(File file) { - try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(file))) { - String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath())); - StringBuilder content = new StringBuilder(); - // 提取每一页的内容 - PDFTextStripper pdfStripper = new PDFTextStripper(); - for (int pageNumber = 1; pageNumber <= document.getNumberOfPages(); pageNumber++) { - pdfStripper.setStartPage(pageNumber); - pdfStripper.setEndPage(pageNumber); - String text = pdfStripper.getText(document).trim(); - if (!text.isEmpty()) { - content.append(text); - } else { - taskProgressService.addTaskProgress(file,TaskType.OCR, pageNumber + "/" + document.getNumberOfPages()); - PDPage page = document.getPage(pageNumber - 1); - PDResources resources = page.getResources(); - for (COSName xObjectName : resources.getXObjectNames()) { - PDXObject xObject = resources.getXObject(xObjectName); - if (xObject instanceof PDImageXObject image) { - BufferedImage bufferedImage = image.getImage(); - // 将图像保存到临时文件 - String tempImageFile = ocrService.generateOrcTempImagePath(username); - ImageIO.write(bufferedImage, "png", new File(tempImageFile)); - try { - // 使用 Tesseract 进行 OCR 识别 - String ocrResult = ocrService.doOCR(tempImageFile, ocrService.generateOrcTempImagePath(username)); - content.append(ocrResult); - } finally { - // 删除临时文件 - FileUtil.del(tempImageFile); - } - } - } - } - } - return content.toString(); - } catch (IOException e) { - FileContentUtil.readFailed(file, e); - } finally { - taskProgressService.removeTaskProgress(file); - } - return null; - } - -} diff --git a/src/main/java/com/jmal/clouddisk/media/VideoProcessService.java b/src/main/java/com/jmal/clouddisk/media/VideoProcessService.java index 4e19f7ae..49173e01 100644 --- a/src/main/java/com/jmal/clouddisk/media/VideoProcessService.java +++ b/src/main/java/com/jmal/clouddisk/media/VideoProcessService.java @@ -476,7 +476,7 @@ public VideoInfo getVideoCover(String fileId, String username, String relativePa * @param fileId fileId * @return 视频文件缓存目录 */ - private String getVideoCacheDir(String username, String fileId) { + public String getVideoCacheDir(String username, String fileId) { // 视频文件缓存目录 String videoCacheDir = Paths.get(fileProperties.getRootDir(), fileProperties.getChunkFileDir(), username, fileProperties.getVideoTranscodeCache(), fileId).toString(); if (!FileUtil.exist(videoCacheDir)) { diff --git a/src/main/java/com/jmal/clouddisk/model/FileDocument.java b/src/main/java/com/jmal/clouddisk/model/FileDocument.java index 6dd3d16c..ccd225b7 100644 --- a/src/main/java/com/jmal/clouddisk/model/FileDocument.java +++ b/src/main/java/com/jmal/clouddisk/model/FileDocument.java @@ -196,6 +196,11 @@ public class FileDocument extends FileBase { private Boolean move; + /** + * 是否显示封面, 缩略图保存在content中 + */ + private Boolean showCover; + /** * 操作权限 */ diff --git a/src/main/java/com/jmal/clouddisk/model/FileIntroVO.java b/src/main/java/com/jmal/clouddisk/model/FileIntroVO.java index bcbef1f2..446c0156 100644 --- a/src/main/java/com/jmal/clouddisk/model/FileIntroVO.java +++ b/src/main/java/com/jmal/clouddisk/model/FileIntroVO.java @@ -92,6 +92,10 @@ public class FileIntroVO extends FileBase { * 封面 */ private String cover; + /*** + * 是否显示封面, 缩略图保存在content中 + */ + private Boolean showCover; /** * 挂载的文件id */ diff --git a/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java b/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java index 177b4b78..ff5077cb 100644 --- a/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java +++ b/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java @@ -1103,4 +1103,18 @@ public void deleteDocWithDeleteFlag() { log.info("删除有删除标记的文档: {}", deleteResult.getDeletedCount()); } } + + /** + * 更新文件封面 + * @param fileId 文件Id + * @param coverFile 封面文件 + */ + public void updateCoverFileDocument(String fileId, File coverFile) { + Query query = new Query(); + query.addCriteria(Criteria.where("_id").is(new ObjectId(fileId))); + Update update = new Update(); + generateThumbnail(coverFile, update); + update.set("showCover", true); + mongoTemplate.updateFirst(query, update, COLLECTION_NAME); + } } diff --git a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java index dc951118..68914cf0 100644 --- a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java +++ b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java @@ -1,56 +1,59 @@ package com.jmal.clouddisk.util; import lombok.extern.slf4j.Slf4j; -import org.apache.poi.xslf.usermodel.XMLSlideShow; -import org.apache.poi.xslf.usermodel.XSLFShape; -import org.apache.poi.xslf.usermodel.XSLFSlide; -import org.apache.poi.xslf.usermodel.XSLFTextShape; -import org.apache.poi.xwpf.usermodel.XWPFDocument; -import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; +import javax.imageio.ImageIO; +import java.awt.*; +import java.awt.image.BufferedImage; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.util.List; @Slf4j public class FileContentUtil { - public static String readPPTContent(File file) { - try (FileInputStream fis = new FileInputStream(file.getAbsolutePath()); - XMLSlideShow ppt = new XMLSlideShow(fis)) { - StringBuilder stringBuilder = new StringBuilder(); - for (XSLFSlide slide : ppt.getSlides()) { - for (XSLFShape shape : slide.getShapes()) { - if (shape instanceof XSLFTextShape textShape) { - stringBuilder.append(textShape.getText()); - } - } - } - return stringBuilder.toString(); - } catch (IOException e) { - readFailed(file, e); - } - return null; + public static void readFailed(File file, IOException e) { + log.warn("读取文件内容失败, file: {}, {}", file.getAbsolutePath(), e.getMessage()); } - public static String readWordContent(File file) { - try (FileInputStream fis = new FileInputStream(file.getAbsolutePath()); - XWPFDocument document = new XWPFDocument(fis)) { - StringBuilder stringBuilder = new StringBuilder(); - List paragraphs = document.getParagraphs(); - for (XWPFParagraph para : paragraphs) { - stringBuilder.append(para.getText()); + public static File pdfCoverImage(PDDocument document, String outputPath) { + try { + PDFRenderer pdfRenderer = new PDFRenderer(document); + int pageIndex = 0; + int dpi = 150; + + // 渲染第一页的图像,获取图像的尺寸 + BufferedImage tempImage = pdfRenderer.renderImageWithDPI(pageIndex, dpi, ImageType.RGB); + int width = tempImage.getWidth(); + int height = tempImage.getHeight(); + tempImage.flush(); + + // 创建用于逐块渲染的BufferedImage + BufferedImage coverImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); + Graphics2D graphics = coverImage.createGraphics(); + + // 设置渲染块的大小 + int blockSize = 100; + for (int y = 0; y < height; y += blockSize) { + int blockHeight = Math.min(blockSize, height - y); + Rectangle block = new Rectangle(0, y, width, blockHeight); + BufferedImage blockImage = pdfRenderer.renderImage(pageIndex); + graphics.drawImage(blockImage, 0, y, null); + blockImage.flush(); } - return stringBuilder.toString(); - } catch (IOException e) { - readFailed(file, e); - } - return null; - } + graphics.dispose(); - public static void readFailed(File file, IOException e) { - log.warn("读取文件内容失败, file: {}, {}", file.getAbsolutePath(), e.getMessage()); + // 将封面图像保存为JPEG文件 + File coverImageFile = new File(outputPath, "cover.jpg"); + ImageIO.write(coverImage, "JPEG", coverImageFile); + + coverImage.flush(); + return coverImageFile; + } catch (Exception e) { + return null; + } } } diff --git a/src/main/java/com/jmal/clouddisk/util/MyFileUtils.java b/src/main/java/com/jmal/clouddisk/util/MyFileUtils.java index 04d0d890..26f08828 100644 --- a/src/main/java/com/jmal/clouddisk/util/MyFileUtils.java +++ b/src/main/java/com/jmal/clouddisk/util/MyFileUtils.java @@ -22,7 +22,7 @@ @Slf4j public class MyFileUtils { - public static List hasContentTypes = Arrays.asList("pdf", "drawio", "mind", "doc", "docx", "xls", "xlsx", "xlsm", "ppt", "pptx", "csv", "tsv", "dotm", "xlt", "xltm", "dot", "dotx", "xlam", "xla", "pages"); + public static List hasContentTypes = Arrays.asList("pdf", "drawio", "mind", "doc", "docx", "xls", "xlsx", "xlsm", "ppt", "pptx", "csv", "tsv", "dotm", "xlt", "xltm", "dot", "dotx", "xlam", "xla", "pages", "epub"); private MyFileUtils(){ diff --git a/src/test/java/com/jmal/clouddisk/EpubCoverTest.java b/src/test/java/com/jmal/clouddisk/EpubCoverTest.java new file mode 100644 index 00000000..6ae559df --- /dev/null +++ b/src/test/java/com/jmal/clouddisk/EpubCoverTest.java @@ -0,0 +1,94 @@ +package com.jmal.clouddisk; + +import cn.hutool.core.date.TimeInterval; +import nl.siegmann.epublib.domain.Book; +import nl.siegmann.epublib.domain.Resource; +import nl.siegmann.epublib.domain.Spine; +import nl.siegmann.epublib.epub.EpubReader; +import org.apache.tika.exception.TikaException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; + +public class EpubCoverTest { + public static void main(String[] args) { + try { + TimeInterval timeInterval = new TimeInterval(); + File epubFile = new File("/Users/jmal/Downloads/《延展力》未来职场的创造性重构与自我迭代.epub"); + File coverImageFile = epubCoverImage(epubFile); + System.out.println("epub 提取封面图像耗时: " + timeInterval.intervalMs() + "ms"); + System.out.println("epub 封面图像已保存为: " + coverImageFile); + + // 读取 EPUB 内容 + String content = readEpubContent(epubFile); + System.out.println("epub 内容: " + content); + } catch (IOException | TikaException e) { + System.out.println("epub 提取封面图像失败: " + e.getMessage()); + } + } + + private static File epubCoverImage(File epubFile) throws IOException, TikaException { + try { + // 打开 EPUB 文件 + FileInputStream fileInputStream = new FileInputStream(epubFile); + EpubReader epubReader = new EpubReader(); + Book book = epubReader.readEpub(fileInputStream); + + // 获取封面图片 + Resource coverImage = book.getCoverImage(); + if (coverImage != null) { + InputStream coverImageInputStream = coverImage.getInputStream(); + File coverImageFile = Paths.get(epubFile.getParent(), epubFile.getName().replace(".epub", "_cover.jpg")).toFile(); + OutputStream coverImageOutput = new FileOutputStream(coverImageFile); + + byte[] buffer = new byte[1024]; + int bytesRead; + while ((bytesRead = coverImageInputStream.read(buffer)) != -1) { + coverImageOutput.write(buffer, 0, bytesRead); + } + coverImageInputStream.close(); + coverImageOutput.close(); + return coverImageFile; + } else { + System.out.println("EPUB 文件中没有封面"); + } + fileInputStream.close(); + } catch (Exception e) { + e.printStackTrace(); + } + + return null; + } + + private static String readEpubContent(File epubFile) { + try { + // 打开 EPUB 文件 + InputStream fileInputStream = new FileInputStream(epubFile); + EpubReader epubReader = new EpubReader(); + Book book = epubReader.readEpub(fileInputStream); + StringBuilder content = new StringBuilder(); + // 获取章节内容 + Spine spine = book.getSpine(); + for (int i = 0; i < spine.size(); i++) { + Resource resource = spine.getResource(i); + InputStream is = resource.getInputStream(); + byte[] bytes = is.readAllBytes(); + String htmlContent = new String(bytes, StandardCharsets.UTF_8); + // 使用 JSoup 解析 HTML 并提取纯文本 + Document document = Jsoup.parse(htmlContent); + String textContent = document.text(); + content.append(textContent); + is.close(); + } + fileInputStream.close(); + return content.toString(); + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + +} diff --git a/src/test/java/com/jmal/clouddisk/PdfCoverTest.java b/src/test/java/com/jmal/clouddisk/PdfCoverTest.java new file mode 100644 index 00000000..e819c6bc --- /dev/null +++ b/src/test/java/com/jmal/clouddisk/PdfCoverTest.java @@ -0,0 +1,41 @@ +package com.jmal.clouddisk; + +import cn.hutool.core.date.TimeInterval; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBufferedFile; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.tika.exception.TikaException; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; + +public class PdfCoverTest { + public static void main(String[] args) { + try { + TimeInterval timeInterval = new TimeInterval(); + File pdfFile = new File("/Users/jmal/temp/filetest/rootpath/jmal/jmal/1001241328 JLG BIM Content User Guide - Boom Lift.pdf"); + File coverImageFile = pdfCoverImage(pdfFile); + System.out.println("pdf 提取封面图像耗时: " + timeInterval.intervalMs() + "ms"); + System.out.println("pdf 封面图像已保存为: " + coverImageFile); + } catch (IOException | TikaException e) { + System.out.println("pdf 提取封面图像失败: " + e.getMessage()); + } + } + + private static File pdfCoverImage(File pdfFile) throws IOException, TikaException { + // 使用PDFBox读取PDF并提取封面图像 + try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(pdfFile))) { + PDFRenderer pdfRenderer = new PDFRenderer(document); + BufferedImage coverImage = pdfRenderer.renderImageWithDPI(0, 300, ImageType.RGB); + + // 将封面图像保存为JPEG文件 + File coverImageFile = new File(pdfFile.getParent(), pdfFile.getName().replace(".pdf", "_cover.jpg")); + ImageIO.write(coverImage, "JPEG", coverImageFile); + return coverImageFile; + } + } +} diff --git a/src/test/java/com/jmal/clouddisk/WordCoverTest.java b/src/test/java/com/jmal/clouddisk/WordCoverTest.java new file mode 100644 index 00000000..7e5f4e60 --- /dev/null +++ b/src/test/java/com/jmal/clouddisk/WordCoverTest.java @@ -0,0 +1,89 @@ +package com.jmal.clouddisk; + +import cn.hutool.core.date.TimeInterval; +import com.itextpdf.styledxmlparser.jsoup.Jsoup; +import com.itextpdf.styledxmlparser.jsoup.nodes.Element; +import com.itextpdf.styledxmlparser.jsoup.select.Elements; +import com.spire.doc.Document; +import com.spire.doc.FileFormat; +import com.spire.doc.documents.ImageType; +import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.codec.binary.Base64; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFPictureData; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.*; +import java.util.List; + +@Slf4j +public class WordCoverTest { + public static void main(String[] args) { + try { + TimeInterval timeInterval = new TimeInterval(); + File pdfFile = new File("/Users/jmal/Downloads/未命名文件1-2.docx"); + File coverImageFile = wordCoverImage(pdfFile); + System.out.println("word 提取封面图像耗时: " + timeInterval.intervalMs() + "ms"); + System.out.println("word 封面图像已保存为: " + coverImageFile); + timeInterval.restart(); + String html = docxToHtml(pdfFile); + System.out.println("word 转换为html: " + html); + System.out.println("word 转换为html耗时: " + timeInterval.intervalMs() + "ms"); + } catch (IOException e) { + System.out.println("word 提取封面图像失败: " + e.getMessage()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static File wordCoverImage(File wordFile) throws IOException { + try (FileInputStream fis = new FileInputStream(wordFile.getAbsolutePath())) { + Document document = new Document(); + document.loadFromStream(fis, FileFormat.Docx); + if (document.getPageCount() > 0) { + // 获取第1页并转换为图片 + BufferedImage image = document.saveToImages(0, ImageType.Bitmap); + // 将图片写入输出流 + // ByteArrayOutputStream imageOutputStream = new ByteArrayOutputStream(); + // ImageIO.write(image, "png", imageOutputStream); + // 保存图片 + File coverImageFile = new File(wordFile.getParent(), wordFile.getName().replace(".docx", "_cover.png")); + ImageIO.write(image, "png", coverImageFile); + return coverImageFile; + } + } + return null; + } + + public static String docxToHtml(File wordFile) throws Exception { + InputStream input = new FileInputStream(wordFile); + XWPFDocument document = new XWPFDocument(input); + List list = document.getAllPictures(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + XHTMLConverter.getInstance().convert(document, outputStream, null); + String s = outputStream.toString(); + return setImg(s, list); + } + + private static String setImg(String html, List list) { + com.itextpdf.styledxmlparser.jsoup.nodes.Document doc = Jsoup.parse(html); + Elements elements = doc.getElementsByTag("img"); + if (elements != null && !elements.isEmpty() && list != null) { + for (Element element : elements) { + String src = element.attr("src"); + for (XWPFPictureData data : list) { + if (src.contains(data.getFileName())) { + String type = src.substring(src.lastIndexOf(".") + 1); + String base64 = "data:image/" + type + ";base64," + new String(Base64.encodeBase64(data.getData())); + element.attr("src", base64); + break; + } + } + } + } + return doc.toString(); + } + +} diff --git a/src/test/java/com/jmal/clouddisk/lucene/ReadPDFTest.java b/src/test/java/com/jmal/clouddisk/lucene/ReadPDFTest.java index 64989e53..f6545f3a 100644 --- a/src/test/java/com/jmal/clouddisk/lucene/ReadPDFTest.java +++ b/src/test/java/com/jmal/clouddisk/lucene/ReadPDFTest.java @@ -14,7 +14,7 @@ public class ReadPDFTest { @Autowired - private ReadPDFContentService readPDFContentService; + private ReadContentService readContentService; @Test public void testImagePDF() { @@ -29,7 +29,7 @@ public void testImagePDF() { assertNotNull(file, "File should not be null"); assertTrue(file.exists(), "File should exist"); - String content = readPDFContentService.read(file); + String content = readContentService.readPdfContent(file, null); assertNotNull(content, "Content should not be null"); assertFalse(content.isEmpty(), "Content should not be empty"); @@ -44,7 +44,7 @@ public void testTextPDF() { assertNotNull(file, "File should not be null"); assertTrue(file.exists(), "File should exist"); - String content = readPDFContentService.read(file); + String content = readContentService.readPdfContent(file, null); assertNotNull(content, "Content should not be null"); assertFalse(content.isEmpty(), "Content should not be empty"); From 4226929c18f70e255699d05c018f4bf080243aef Mon Sep 17 00:00:00 2001 From: jmal Date: Wed, 31 Jul 2024 18:39:23 +0800 Subject: [PATCH 2/4] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9Epdf=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=B0=81=E9=9D=A2=E9=A2=84=E8=A7=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/jmal/clouddisk/lucene/ReadContentService.java | 2 +- src/main/java/com/jmal/clouddisk/util/FileContentUtil.java | 2 -- src/test/java/com/jmal/clouddisk/EpubCoverTest.java | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java index c6f905cb..523fb101 100644 --- a/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java +++ b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java @@ -107,7 +107,7 @@ public String readPdfContent(File file, String fileId) { } public String readEpubContent(File file) { - try (InputStream fileInputStream = new FileInputStream(file);) { + try (InputStream fileInputStream = new FileInputStream(file)) { // 打开 EPUB 文件 EpubReader epubReader = new EpubReader(); Book book = epubReader.readEpub(fileInputStream); diff --git a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java index 68914cf0..aa55b125 100644 --- a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java +++ b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java @@ -37,8 +37,6 @@ public static File pdfCoverImage(PDDocument document, String outputPath) { // 设置渲染块的大小 int blockSize = 100; for (int y = 0; y < height; y += blockSize) { - int blockHeight = Math.min(blockSize, height - y); - Rectangle block = new Rectangle(0, y, width, blockHeight); BufferedImage blockImage = pdfRenderer.renderImage(pageIndex); graphics.drawImage(blockImage, 0, y, null); blockImage.flush(); diff --git a/src/test/java/com/jmal/clouddisk/EpubCoverTest.java b/src/test/java/com/jmal/clouddisk/EpubCoverTest.java index 6ae559df..5aa82f5a 100644 --- a/src/test/java/com/jmal/clouddisk/EpubCoverTest.java +++ b/src/test/java/com/jmal/clouddisk/EpubCoverTest.java @@ -57,7 +57,7 @@ private static File epubCoverImage(File epubFile) throws IOException, TikaExcept } fileInputStream.close(); } catch (Exception e) { - e.printStackTrace(); + System.out.println("EPUB 文件读取失败: " + e.getMessage()); } return null; @@ -86,7 +86,7 @@ private static String readEpubContent(File epubFile) { fileInputStream.close(); return content.toString(); } catch (Exception e) { - e.printStackTrace(); + System.out.println("EPUB 文件读取失败: " + e.getMessage()); } return null; } From 086bd2fbbbe2825bcc96e5bbec97525687d8493d Mon Sep 17 00:00:00 2001 From: jmal Date: Thu, 1 Aug 2024 16:14:51 +0800 Subject: [PATCH 3/4] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9Eepub=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=B0=81=E9=9D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- docker-compose.base-aliyun-beijing.yml | 2 +- docker-compose.base-aliyun-guangzhou.yml | 2 +- docker-compose.base.proxy.yml | 2 +- docker-compose.base.yml | 2 +- pom.xml | 22 +++++++ .../controller/rest/FileController.java | 8 +-- .../controller/rest/ShareController.java | 16 ++--- .../jmal/clouddisk/lucene/LuceneService.java | 2 +- .../clouddisk/lucene/ReadContentService.java | 16 +++-- .../jmal/clouddisk/service/IFileService.java | 6 +- .../service/impl/CommonFileService.java | 3 + .../service/impl/FileServiceImpl.java | 17 ++++- .../jmal/clouddisk/util/FileContentUtil.java | 64 ++++++++++--------- src/main/resources/file.yml | 2 +- .../java/com/jmal/clouddisk/PdfCoverTest.java | 4 +- 16 files changed, 111 insertions(+), 59 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2e35f49f..21d11544 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ ARG VERSION ENV MONGODB_URI "mongodb://mongo:27017/jmalcloud" ENV RUN_ENVIRONMENT prod -ENV JVM_OPTS "-Xms50m -Xmx512m" +ENV JVM_OPTS "-Xms256m -Xmx1024m" ENV LOG_LEVEL warn ENV FILE_MONITOR true diff --git a/docker-compose.base-aliyun-beijing.yml b/docker-compose.base-aliyun-beijing.yml index 107e33f6..088e75fb 100644 --- a/docker-compose.base-aliyun-beijing.yml +++ b/docker-compose.base-aliyun-beijing.yml @@ -21,7 +21,7 @@ services: environment: MONGODB_URI: "mongodb://mongo:27017/jmalcloud" TZ: "Asia/Shanghai" - JVM_OPTS: "-Xms256m -Xmx512m" + JVM_OPTS: "-Xms256m -Xmx1024m" volumes: - ./docker/jmalcloud/files:/jmalcloud/files/ restart: unless-stopped diff --git a/docker-compose.base-aliyun-guangzhou.yml b/docker-compose.base-aliyun-guangzhou.yml index 9e39416c..6179d4b0 100644 --- a/docker-compose.base-aliyun-guangzhou.yml +++ b/docker-compose.base-aliyun-guangzhou.yml @@ -21,7 +21,7 @@ services: environment: MONGODB_URI: "mongodb://mongo:27017/jmalcloud" TZ: "Asia/Shanghai" - JVM_OPTS: "-Xms256m -Xmx512m" + JVM_OPTS: "-Xms256m -Xmx1024m" volumes: - ./docker/jmalcloud/files:/jmalcloud/files/ restart: unless-stopped diff --git a/docker-compose.base.proxy.yml b/docker-compose.base.proxy.yml index 0421cb36..0c63690e 100644 --- a/docker-compose.base.proxy.yml +++ b/docker-compose.base.proxy.yml @@ -21,7 +21,7 @@ services: environment: MONGODB_URI: "mongodb://mongo:27017/jmalcloud" TZ: "Asia/Shanghai" - JVM_OPTS: "-Xms256m -Xmx512m" + JVM_OPTS: "-Xms256m -Xmx1024m" volumes: - ./docker/jmalcloud/files:/jmalcloud/files/ restart: unless-stopped diff --git a/docker-compose.base.yml b/docker-compose.base.yml index a2751f23..bd718e63 100644 --- a/docker-compose.base.yml +++ b/docker-compose.base.yml @@ -21,7 +21,7 @@ services: environment: MONGODB_URI: "mongodb://mongo:27017/jmalcloud" TZ: "Asia/Shanghai" - JVM_OPTS: "-Xms256m -Xmx512m" + JVM_OPTS: "-Xms256m -Xmx1024m" volumes: - ./docker/jmalcloud/files:/jmalcloud/files/ restart: unless-stopped diff --git a/pom.xml b/pom.xml index 44b8afcb..12a6c57d 100644 --- a/pom.xml +++ b/pom.xml @@ -74,6 +74,16 @@ org.springframework.boot spring-boot-starter-data-mongodb + + org.springframework.boot + spring-boot-starter-logging + + + org.slf4j + slf4j-simple + + + org.springframework.boot spring-boot-starter-web @@ -122,6 +132,12 @@ net.sourceforge.tess4j tess4j 5.11.0 + + + commons-logging + commons-logging + + @@ -251,6 +267,12 @@ nl.siegmann.epublib epublib-core 3.1 + + + org.slf4j + slf4j-simple + + org.jsoup diff --git a/src/main/java/com/jmal/clouddisk/controller/rest/FileController.java b/src/main/java/com/jmal/clouddisk/controller/rest/FileController.java index a14fc23a..f0ac8675 100644 --- a/src/main/java/com/jmal/clouddisk/controller/rest/FileController.java +++ b/src/main/java/com/jmal/clouddisk/controller/rest/FileController.java @@ -247,12 +247,12 @@ public void packageDownload(HttpServletRequest request, HttpServletResponse resp @GetMapping("/view/thumbnail") @Permission("cloud:file:list") @LogOperatingFun(logType = LogOperation.Type.BROWSE) - public ResponseEntity thumbnail(@RequestParam String id) { + public ResponseEntity thumbnail(@RequestParam String id, Boolean showCover) { String ossPath = CaffeineUtil.getOssPath(Paths.get(id)); if (ossPath != null) { return webOssService.thumbnail(ossPath, id); } - Optional file = fileService.thumbnail(id); + Optional file = fileService.thumbnail(id, showCover); return file.map(fileService::getObjectResponseEntity).orElseGet(() -> ResponseEntity.status(HttpStatus.NOT_FOUND).body("找不到该文件")); } @@ -260,8 +260,8 @@ public ResponseEntity thumbnail(@RequestParam String id) { @GetMapping("/view/thumbnail/{filename}") @Permission("cloud:file:list") @LogOperatingFun(logType = LogOperation.Type.BROWSE) - public ResponseEntity thumbnailName(@RequestParam String id) { - return thumbnail(id); + public ResponseEntity thumbnailName(@RequestParam String id, Boolean showCover) { + return thumbnail(id, showCover); } @Operation(summary = "显示缩略图(媒体封面)") diff --git a/src/main/java/com/jmal/clouddisk/controller/rest/ShareController.java b/src/main/java/com/jmal/clouddisk/controller/rest/ShareController.java index a0db7998..39d91def 100644 --- a/src/main/java/com/jmal/clouddisk/controller/rest/ShareController.java +++ b/src/main/java/com/jmal/clouddisk/controller/rest/ShareController.java @@ -180,22 +180,22 @@ public void publicPackageDownloadOne(HttpServletRequest request, HttpServletResp @Operation(summary = "显示缩略图") @GetMapping("/articles/s/view/thumbnail") @LogOperatingFun(logType = LogOperation.Type.BROWSE) - public ResponseEntity articlesThumbnail(String id) { - return thumbnail(id, null); + public ResponseEntity articlesThumbnail(String id, Boolean showCover) { + return thumbnail(id, showCover, null); } @Operation(summary = "显示缩略图") @GetMapping("/public/s/view/thumbnail") @LogOperatingFun(logType = LogOperation.Type.BROWSE) - public ResponseEntity publicThumbnail(String id, HttpServletRequest request) { - return thumbnail(id, request); + public ResponseEntity publicThumbnail(String id, Boolean showCover, HttpServletRequest request) { + return thumbnail(id, showCover, request); } @Operation(summary = "显示缩略图") @GetMapping("/public/s/view/thumbnail/{filename}") @LogOperatingFun(logType = LogOperation.Type.BROWSE) - public ResponseEntity publicThumbnailName(String id, HttpServletRequest request) { - return publicThumbnail(id, request); + public ResponseEntity publicThumbnailName(String id, Boolean showCover, HttpServletRequest request) { + return publicThumbnail(id, showCover, request); } @Operation(summary = "显示缩略图(媒体封面)") @@ -207,9 +207,9 @@ public ResponseEntity coverOfMedia(String id, String name) { return file.map(fileService::getObjectResponseEntity).orElseGet(() -> ResponseEntity.status(HttpStatus.NOT_FOUND).body("找不到该文件")); } - private ResponseEntity thumbnail(String id, HttpServletRequest request) { + private ResponseEntity thumbnail(String id, Boolean showCover, HttpServletRequest request) { ResultUtil.checkParamIsNull(id); - Optional file = fileService.thumbnail(id); + Optional file = fileService.thumbnail(id, showCover); if (fileInterceptor.isNotAllowAccess(file.orElse(null), request)) { return null; } diff --git a/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java b/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java index e5510db6..73768aab 100644 --- a/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java +++ b/src/main/java/com/jmal/clouddisk/lucene/LuceneService.java @@ -346,7 +346,7 @@ private String readFileContent(File file, String fileId) { return readContentService.readPdfContent(file, fileId); } if ("epub".equals(type)) { - return readContentService.readEpubContent(file); + return readContentService.readEpubContent(file, fileId); } if ("ppt".equals(type) || "pptx".equals(type)) { return readContentService.readPPTContent(file); diff --git a/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java index 523fb101..59d21a4c 100644 --- a/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java +++ b/src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java @@ -59,9 +59,9 @@ public String readPdfContent(File file, String fileId) { String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath())); - if (StrUtil.isNotBlank(fileId)) { - // 生成pdf封面图像 - File coverFile = FileContentUtil.pdfCoverImage(document, videoProcessService.getVideoCacheDir(username, fileId)); + // 生成封面图像 + if (StrUtil.isNotBlank(fileId)) { + File coverFile = FileContentUtil.pdfCoverImage(file, document, videoProcessService.getVideoCacheDir(username, fileId)); commonFileService.updateCoverFileDocument(fileId, coverFile); } @@ -106,11 +106,19 @@ public String readPdfContent(File file, String fileId) { return null; } - public String readEpubContent(File file) { + public String readEpubContent(File file, String fileId) { try (InputStream fileInputStream = new FileInputStream(file)) { // 打开 EPUB 文件 EpubReader epubReader = new EpubReader(); Book book = epubReader.readEpub(fileInputStream); + + // 生成封面图像 + String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath())); + if (StrUtil.isNotBlank(fileId)) { + File coverFile = FileContentUtil.epubCoverImage(book, videoProcessService.getVideoCacheDir(username, fileId)); + commonFileService.updateCoverFileDocument(fileId, coverFile); + } + StringBuilder content = new StringBuilder(); // 获取章节内容 Spine spine = book.getSpine(); diff --git a/src/main/java/com/jmal/clouddisk/service/IFileService.java b/src/main/java/com/jmal/clouddisk/service/IFileService.java index ae39f624..0dab3cdc 100644 --- a/src/main/java/com/jmal/clouddisk/service/IFileService.java +++ b/src/main/java/com/jmal/clouddisk/service/IFileService.java @@ -178,10 +178,12 @@ public interface IFileService { /** * 显示缩略图 - * @param id fileId + * + * @param id fileId + * @param showCover 是否显示封面 * @return FileDocument */ - Optional thumbnail(String id); + Optional thumbnail(String id, Boolean showCover); /** * 显示缩略图(媒体文件封面) diff --git a/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java b/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java index ff5077cb..ce022bd0 100644 --- a/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java +++ b/src/main/java/com/jmal/clouddisk/service/impl/CommonFileService.java @@ -1110,6 +1110,9 @@ public void deleteDocWithDeleteFlag() { * @param coverFile 封面文件 */ public void updateCoverFileDocument(String fileId, File coverFile) { + if (coverFile == null || !coverFile.exists()) { + return; + } Query query = new Query(); query.addCriteria(Criteria.where("_id").is(new ObjectId(fileId))); Update update = new Update(); diff --git a/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java b/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java index 9c0b3742..d8faa40c 100644 --- a/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java +++ b/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java @@ -50,6 +50,7 @@ import org.mozilla.universalchardet.ReaderFactory; import org.springframework.beans.BeanUtils; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.data.domain.Sort; import org.springframework.data.mongodb.core.query.Criteria; import org.springframework.data.mongodb.core.query.Query; @@ -109,6 +110,9 @@ public class FileServiceImpl extends CommonFileService implements IFileService { LuceneService luceneService; private static final AES aes = SecureUtil.aes(); + @Qualifier("commonFileService") + @Autowired + private CommonFileService commonFileService; @Override public ResponseResult listFiles(UploadApiParamDTO upload) throws CommonException { @@ -624,11 +628,20 @@ private static String removeAnsiCodes(String text) { } @Override - public Optional thumbnail(String id) { + public Optional thumbnail(String id, Boolean showCover) { FileDocument fileDocument = mongoTemplate.findById(id, FileDocument.class, COLLECTION_NAME); if (fileDocument != null) { + + String username = userService.getUserNameById(fileDocument.getUserId()); + + if (BooleanUtil.isTrue(showCover) && BooleanUtil.isTrue(fileDocument.getShowCover())) { + File file = FileContentUtil.getCoverPath(videoProcessService.getVideoCacheDir(username,id)); + if (file.exists()) { + fileDocument.setContent(FileUtil.readBytes(file)); + return Optional.of(fileDocument); + } + } if (fileDocument.getContent() == null) { - String username = userService.getUserNameById(fileDocument.getUserId()); String currentDirectory = getUserDirectory(fileDocument.getPath()); File file = new File(fileProperties.getRootDir() + File.separator + username + currentDirectory + fileDocument.getName()); if (file.exists()) { diff --git a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java index aa55b125..122c6847 100644 --- a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java +++ b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java @@ -1,15 +1,15 @@ package com.jmal.clouddisk.util; import lombok.extern.slf4j.Slf4j; +import nl.siegmann.epublib.domain.Book; +import nl.siegmann.epublib.domain.Resource; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; import javax.imageio.ImageIO; -import java.awt.*; import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; +import java.io.*; @Slf4j public class FileContentUtil { @@ -18,40 +18,44 @@ public static void readFailed(File file, IOException e) { log.warn("读取文件内容失败, file: {}, {}", file.getAbsolutePath(), e.getMessage()); } - public static File pdfCoverImage(PDDocument document, String outputPath) { + public static File getCoverPath(String outputPath) { + return new File(outputPath, "cover.jpg"); + } + + public static File pdfCoverImage(File file, PDDocument document, String outputPath) { try { PDFRenderer pdfRenderer = new PDFRenderer(document); - int pageIndex = 0; - int dpi = 150; - - // 渲染第一页的图像,获取图像的尺寸 - BufferedImage tempImage = pdfRenderer.renderImageWithDPI(pageIndex, dpi, ImageType.RGB); - int width = tempImage.getWidth(); - int height = tempImage.getHeight(); - tempImage.flush(); - - // 创建用于逐块渲染的BufferedImage - BufferedImage coverImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); - Graphics2D graphics = coverImage.createGraphics(); - - // 设置渲染块的大小 - int blockSize = 100; - for (int y = 0; y < height; y += blockSize) { - BufferedImage blockImage = pdfRenderer.renderImage(pageIndex); - graphics.drawImage(blockImage, 0, y, null); - blockImage.flush(); - } - graphics.dispose(); - + BufferedImage coverImage = pdfRenderer.renderImageWithDPI(0, 128, ImageType.RGB); // 将封面图像保存为JPEG文件 - File coverImageFile = new File(outputPath, "cover.jpg"); + File coverImageFile = getCoverPath(outputPath); ImageIO.write(coverImage, "JPEG", coverImageFile); - - coverImage.flush(); return coverImageFile; - } catch (Exception e) { + } catch (Throwable e) { + log.warn("PDF 文件封面图像生成失败: {}, file: {}", e.getMessage(), file.getName()); return null; } } + public static File epubCoverImage(Book book, String outputPath) { + // 获取封面图片 + Resource coverImage = book.getCoverImage(); + if (coverImage != null) { + File coverImageFile = getCoverPath(outputPath); + try (InputStream coverImageInputStream = coverImage.getInputStream(); + OutputStream coverImageOutput = new FileOutputStream(coverImageFile);) { + byte[] buffer = new byte[1024]; + int bytesRead; + while ((bytesRead = coverImageInputStream.read(buffer)) != -1) { + coverImageOutput.write(buffer, 0, bytesRead); + } + return coverImageFile; + } catch (Throwable e) { + log.warn("epub 文件封面图像生成失败: {}", e.getMessage()); + return null; + } + } + log.warn("epub 文件封面图像生成失败: 未找到封面图片"); + return null; + } + } diff --git a/src/main/resources/file.yml b/src/main/resources/file.yml index af64a051..5231670a 100644 --- a/src/main/resources/file.yml +++ b/src/main/resources/file.yml @@ -22,7 +22,7 @@ file: # 文本类型 simText: [ drawio, mind, txt, html, htm, xhtml, css, less, sass, scss, js, ts, jsx, tsx, json, xml, csv, tsv, md, markdown, rst, yaml, yml, ini, toml, cfg, conf, log, bat, cmd, sh, bash, zsh, ps1, py, pyw, pyc, pyo, pyd, rb, erb, pl, pm, t, php, phtml, phps, java, jsp, jspx, jsf, jws, jsp, jtpl, scala, kt, kts, groovy, gvy, gy, gsh, swift, c, cc, cpp, cxx, h, hh, hpp, hxx, cs, csx, vb, fs, fsx, fsi, ml, mli, go, rs, rlib, d, asm, s, sql, pgsql, psql, plpgsql, pls, plb, plsql, sqlite, db, dbf, mdb, accdb, cbl, cob, cpy, tcl, tk, lua, hs, erl, hrl, ex, exs, clj, cljs, edn, lisp, lsp, scm, rkt, ss, sml, v, sv, svh, vhd, vhdl, ino, pde, bsv, f, f90, f95, f03, f08, for, f77, f18, vba, vb, bas, cls, frm, frx, tex, latex, ltx, bib, bbl, sty, cls, dtx, ins, rst, rest, org, asciidoc, adoc, asc, pod, pov, mmd, mn, muse, creole, wiki, dokuwiki, vimwiki, haddock, jsdoc, pydoc, rdoc, yard, doxygen, roxygen2, javadoc, xmldoc, html, xhtml, htm, shtm, shtml, mht, mhtml, hdml, tpl, tmpl, vue, ejs, hbs, haml, pug, jade, slim, mustache, handlebars, nunjucks, liquid, jinja2, jinja, jnj, j2, njk, twig, swig, poi, t4, tt, tt2, tpl, eta, ect, coffee, litcoffee, dart, diff, patch, hs, x, xi, xmi, xaml, kml, wsdl, plist, list, nfo, srt, sub, sbv, vtt, bml, mrl, irl, log, changelog, CHANGELOG, license, LICENCE, LICENSE, copying, COPYING, readme, README, todo, TODO, contributing, CONTRIBUTING, authors, AUTHORS, dockerfile, Dockerfile, code-workspace, jsconfig, tsconfig, jshintrc, jscsrc, eslintrc, eslintignore, babelrc, browserconfig, webmanifest, htaccess, gitlab-ci, travis, circleci, jenkinsfile, prettierrc, stylelintrc, lintstagedrc, commitlintrc ] # 文档类型 - document: [ "pdf", "doc", "docx", "xlsx", "xls", "xl", "md", "ppt", "pptx" ] + document: [ "pdf", "doc", "docx", "xlsx", "xls", "xl", "md", "ppt", "pptx", "epub" ] # webDAV协议前缀 web-dav-prefix: webDAV # ip2region.xdb path diff --git a/src/test/java/com/jmal/clouddisk/PdfCoverTest.java b/src/test/java/com/jmal/clouddisk/PdfCoverTest.java index e819c6bc..1f02647f 100644 --- a/src/test/java/com/jmal/clouddisk/PdfCoverTest.java +++ b/src/test/java/com/jmal/clouddisk/PdfCoverTest.java @@ -17,7 +17,7 @@ public class PdfCoverTest { public static void main(String[] args) { try { TimeInterval timeInterval = new TimeInterval(); - File pdfFile = new File("/Users/jmal/temp/filetest/rootpath/jmal/jmal/1001241328 JLG BIM Content User Guide - Boom Lift.pdf"); + File pdfFile = new File("/Users/jmal/Downloads/1001241328 JLG BIM Content User Guide - Boom Lift (1).pdf"); File coverImageFile = pdfCoverImage(pdfFile); System.out.println("pdf 提取封面图像耗时: " + timeInterval.intervalMs() + "ms"); System.out.println("pdf 封面图像已保存为: " + coverImageFile); @@ -30,7 +30,7 @@ private static File pdfCoverImage(File pdfFile) throws IOException, TikaExceptio // 使用PDFBox读取PDF并提取封面图像 try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(pdfFile))) { PDFRenderer pdfRenderer = new PDFRenderer(document); - BufferedImage coverImage = pdfRenderer.renderImageWithDPI(0, 300, ImageType.RGB); + BufferedImage coverImage = pdfRenderer.renderImageWithDPI(0, 128, ImageType.RGB); // 将封面图像保存为JPEG文件 File coverImageFile = new File(pdfFile.getParent(), pdfFile.getName().replace(".pdf", "_cover.jpg")); From 0807feae4f99e7f1939850299880175069a9c3bf Mon Sep 17 00:00:00 2001 From: jmal Date: Thu, 1 Aug 2024 16:17:11 +0800 Subject: [PATCH 4/4] refactor: Optimize code --- .../com/jmal/clouddisk/service/impl/FileServiceImpl.java | 8 ++------ .../java/com/jmal/clouddisk/util/FileContentUtil.java | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java b/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java index d8faa40c..7abb2b12 100644 --- a/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java +++ b/src/main/java/com/jmal/clouddisk/service/impl/FileServiceImpl.java @@ -19,6 +19,8 @@ import com.jmal.clouddisk.exception.ExceptionType; import com.jmal.clouddisk.interceptor.AuthInterceptor; import com.jmal.clouddisk.lucene.LuceneService; +import com.jmal.clouddisk.media.VideoInfo; +import com.jmal.clouddisk.media.VideoProcessService; import com.jmal.clouddisk.model.*; import com.jmal.clouddisk.model.query.SearchDTO; import com.jmal.clouddisk.model.rbac.ConsumerDO; @@ -29,8 +31,6 @@ import com.jmal.clouddisk.service.IFileService; import com.jmal.clouddisk.service.IFileVersionService; import com.jmal.clouddisk.util.*; -import com.jmal.clouddisk.media.VideoInfo; -import com.jmal.clouddisk.media.VideoProcessService; import com.jmal.clouddisk.webdav.MyWebdavServlet; import com.mongodb.client.AggregateIterable; import io.reactivex.rxjava3.core.Single; @@ -50,7 +50,6 @@ import org.mozilla.universalchardet.ReaderFactory; import org.springframework.beans.BeanUtils; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.data.domain.Sort; import org.springframework.data.mongodb.core.query.Criteria; import org.springframework.data.mongodb.core.query.Query; @@ -110,9 +109,6 @@ public class FileServiceImpl extends CommonFileService implements IFileService { LuceneService luceneService; private static final AES aes = SecureUtil.aes(); - @Qualifier("commonFileService") - @Autowired - private CommonFileService commonFileService; @Override public ResponseResult listFiles(UploadApiParamDTO upload) throws CommonException { diff --git a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java index 122c6847..21b1efda 100644 --- a/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java +++ b/src/main/java/com/jmal/clouddisk/util/FileContentUtil.java @@ -42,7 +42,7 @@ public static File epubCoverImage(Book book, String outputPath) { if (coverImage != null) { File coverImageFile = getCoverPath(outputPath); try (InputStream coverImageInputStream = coverImage.getInputStream(); - OutputStream coverImageOutput = new FileOutputStream(coverImageFile);) { + OutputStream coverImageOutput = new FileOutputStream(coverImageFile)) { byte[] buffer = new byte[1024]; int bytesRead; while ((bytesRead = coverImageInputStream.read(buffer)) != -1) {