From eb55746bbb52d90af536675ab75a87398a40a6d9 Mon Sep 17 00:00:00 2001 From: Ujjwal Kumar Date: Mon, 16 Dec 2024 13:55:19 +0530 Subject: [PATCH] Rework DOCX image insertion with detailed logging; reformat convert_html_to_docx signature --- src/boxtodocx/handlers/docx_handler.py | 92 +++++++++++++++++++------- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/src/boxtodocx/handlers/docx_handler.py b/src/boxtodocx/handlers/docx_handler.py index 59e153a..6840732 100644 --- a/src/boxtodocx/handlers/docx_handler.py +++ b/src/boxtodocx/handlers/docx_handler.py @@ -38,7 +38,12 @@ def _setup_document(self) -> None: font.name = DEFAULT_FONT_NAME font.size = Pt(DEFAULT_FONT_SIZE) - def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path], assets_dir: Union[str, Path] = None) -> Path: + def convert_html_to_docx( + self, + html_content: str, + output_path: Union[str, Path], + assets_dir: Optional[Union[str, Path]] = None + ) -> Path: """ Convert HTML content to DOCX format. @@ -46,23 +51,20 @@ def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path], html_content: HTML string to convert output_path: Path for output DOCX file assets_dir: Directory containing images and other assets - - Returns: - Path to created DOCX file - - Raises: - ValueError: If HTML content is invalid """ try: output_path = Path(output_path) self.assets_dir = Path(assets_dir) if assets_dir else output_path.parent - soup = BeautifulSoup(html_content, 'html.parser') + # Parse HTML content + soup = BeautifulSoup(html_content, 'html.parser') if not soup.body: raise ValueError("Invalid HTML: no body tag found") - + + # Process all elements self._process_elements(soup.body) + # Save the document output_path.parent.mkdir(parents=True, exist_ok=True) self.document.save(str(output_path)) @@ -72,6 +74,7 @@ def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path], except Exception as e: logger.error(f"DOCX conversion failed: {str(e)}") raise + def _process_elements(self, parent: BeautifulSoup) -> None:
"""Process HTML elements recursively.""" @@ -122,39 +125,82 @@ def _handle_table(self, element: BeautifulSoup) -> None: def _handle_image(self, element: BeautifulSoup) -> None: """Handle image element.""" try: + logger.info("Starting image handling in DOCX") src = element.get('src') + logger.debug(f"Image src attribute: {src}") + if not src: + logger.warning("Image element without src attribute") return - # Handle paths relative to images directory + # Log the current assets directory + logger.debug(f"Assets directory: {self.assets_dir}") + + # Construct and log image paths if src.startswith('images/'): img_path = self.assets_dir / src + logger.debug(f"Using relative path: {img_path}") else: img_path = self.assets_dir / 'images' / Path(src).name + logger.debug(f"Using direct filename: {img_path}") - logger.debug(f"Looking for image at: {img_path}") - + # Check file existence + logger.info(f"Checking image at path: {img_path}") if img_path.exists(): - width = Inches(6) # Default width + logger.info(f"Found image file: {img_path}") + + # Log file details + from pathlib import Path + file_size = Path(img_path).stat().st_size + logger.debug(f"Image file size: {file_size} bytes") + logger.debug(f"Image extension: {img_path.suffix}") + + # Set default and maximum widths + default_width = Inches(6) + max_width = Inches(6) + try: - # Get original size if possible from PIL import Image with Image.open(img_path) as img: w, h = img.size - # Scale to reasonable size while maintaining aspect ratio - if w > 600: # Max width in pixels - width = Inches(min(w / 96, 6)) # Convert pixels to inches + logger.debug(f"Original image dimensions: {w}x{h} pixels") + + # Calculate width while maintaining aspect ratio + if w > 600: + width = min(Inches(w/96), max_width) + logger.debug(f"Scaling large image to width: {width}") + else: + width = Inches(w/96) + logger.debug(f"Using original image width: {width}") + + except Exception as e: + logger.warning(f"Could not process image size, using 
default: {str(e)}") + width = default_width + + # Insert image into document + logger.info(f"Adding image to document: {img_path}") + try: + run = self.document.add_paragraph().add_run() + run.add_picture(str(img_path), width=width) + self.document.add_paragraph() # Add spacing + logger.info(f"Successfully inserted image: {img_path.name}") except Exception as e: - logger.debug(f"Could not get image size: {e}") + logger.error(f"Failed to insert image into document: {str(e)}") + logger.error(f"Image path used: {str(img_path)}") + raise - logger.debug(f"Inserting image: {img_path}") - self.document.add_picture(str(img_path), width=width) - self.document.add_paragraph() # Add spacing after image else: - logger.warning(f"Image not found: {img_path}") + # Log detailed path information if file not found + logger.error(f"Image not found at path: {img_path}") + logger.error(f"Absolute path: {img_path.absolute()}") + logger.error(f"Parent directory exists: {img_path.parent.exists()}") + if img_path.parent.exists(): + logger.error(f"Files in parent directory: {list(img_path.parent.glob('*'))}") except Exception as e: - logger.error(f"Error inserting image: {str(e)}") + logger.error(f"Error in image handling: {str(e)}") + import traceback + logger.error(f"Traceback: {traceback.format_exc()}") def _process_inline_elements(