From eb55746bbb52d90af536675ab75a87398a40a6d9 Mon Sep 17 00:00:00 2001
From: Ujjwal Kumar <ujjwal.kumar1@ibm.com>
Date: Mon, 16 Dec 2024 13:55:19 +0530
Subject: [PATCH] docx_handler: add detailed diagnostics to image handling

---
 src/boxtodocx/handlers/docx_handler.py | 92 +++++++++++++++++++-------
 1 file changed, 69 insertions(+), 23 deletions(-)

diff --git a/src/boxtodocx/handlers/docx_handler.py b/src/boxtodocx/handlers/docx_handler.py
index 59e153a..6840732 100644
--- a/src/boxtodocx/handlers/docx_handler.py
+++ b/src/boxtodocx/handlers/docx_handler.py
@@ -38,7 +38,12 @@ def _setup_document(self) -> None:
         font.name = DEFAULT_FONT_NAME
         font.size = Pt(DEFAULT_FONT_SIZE)
     
-    def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path], assets_dir: Union[str, Path] = None) -> Path:
+    def convert_html_to_docx(
+        self,
+        html_content: str,
+        output_path: Union[str, Path],
+        assets_dir: Optional[Union[str, Path]] = None
+    ) -> Path:
         """
         Convert HTML content to DOCX format.
         
@@ -46,23 +51,20 @@ def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path],
             html_content: HTML string to convert
             output_path: Path for output DOCX file
             assets_dir: Directory containing images and other assets
-
-        Returns:
-            Path to created DOCX file
-            
-        Raises:
-            ValueError: If HTML content is invalid
         """
         try:
             output_path = Path(output_path)
             self.assets_dir = Path(assets_dir) if assets_dir else output_path.parent
-            soup = BeautifulSoup(html_content, 'html.parser')
             
+            # Parse HTML content
+            soup = BeautifulSoup(html_content, 'html.parser')
             if not soup.body:
                 raise ValueError("Invalid HTML: no body tag found")
-                
+            
+            # Process all elements
             self._process_elements(soup.body)
             
+            # Save the document
             output_path.parent.mkdir(parents=True, exist_ok=True)
             self.document.save(str(output_path))
             
@@ -72,6 +74,7 @@ def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path],
         except Exception as e:
             logger.error(f"DOCX conversion failed: {str(e)}")
             raise
+
     
     def _process_elements(self, parent: BeautifulSoup) -> None:
         """Process HTML elements recursively."""
@@ -122,39 +125,82 @@ def _handle_table(self, element: BeautifulSoup) -> None:
     def _handle_image(self, element: BeautifulSoup) -> None:
         """Handle image element."""
         try:
+            logger.info("Starting image handling in DOCX")
             src = element.get('src')
+            logger.debug(f"Image src attribute: {src}")
+            
             if not src:
+                logger.warning("Image element without src attribute")
                 return
 
-            # Handle paths relative to images directory
+            # Log the current assets directory
+            logger.debug(f"Assets directory: {self.assets_dir}")
+
+            # Resolve src under assets_dir: 'images/...' as-is, otherwise by bare filename in images/
             if src.startswith('images/'):
                 img_path = self.assets_dir / src
+                logger.debug(f"Using relative path: {img_path}")
             else:
                 img_path = self.assets_dir / 'images' / Path(src).name
+                logger.debug(f"Using direct filename: {img_path}")
 
-            logger.debug(f"Looking for image at: {img_path}")
-            
+            # Check file existence
+            logger.info(f"Checking image at path: {img_path}")
             if img_path.exists():
-                width = Inches(6)  # Default width
+                logger.info(f"Found image file: {img_path}")
+                # Log file details. img_path is already a Path; importing Path inside
+                # this function would make the name local and break Path(src) above.
+                file_size = img_path.stat().st_size
+                logger.debug(f"Image file size: {file_size} bytes")
+                logger.debug(f"Image extension: {img_path.suffix}")
+
+                # Fallback width (used when the size cannot be read) and hard cap
+                default_width = Inches(6)
+                max_width = Inches(6)
+
                 try:
-                    # Get original size if possible
                     from PIL import Image
                     with Image.open(img_path) as img:
                         w, h = img.size
-                        # Scale to reasonable size while maintaining aspect ratio
-                        if w > 600:  # Max width in pixels
-                            width = Inches(min(w / 96, 6))  # Convert pixels to inches
+                        logger.debug(f"Original image dimensions: {w}x{h} pixels")
+                        
+                        # Convert pixels to inches at 96 px/inch and never exceed max_width
+                        natural_width = Inches(w / 96)
+                        if natural_width > max_width:
+                            width = max_width
+                            logger.debug(f"Scaling large image to width: {width}")
+                        else:
+                            width = natural_width
+                            logger.debug(f"Using original image width: {width}")
+
+                except Exception as e:
+                    logger.warning(f"Could not process image size, using default: {str(e)}")
+                    width = default_width
+
+                # Insert image; only width is passed, so the height is left to python-docx
+                logger.info(f"Adding image to document: {img_path}")
+                try:
+                    run = self.document.add_paragraph().add_run()
+                    run.add_picture(str(img_path), width=width)
+                    self.document.add_paragraph()  # Add spacing
+                    logger.info(f"Successfully inserted image: {img_path.name}")
                 except Exception as e:
-                    logger.debug(f"Could not get image size: {e}")
+                    logger.error(f"Failed to insert image into document: {str(e)}")
+                    logger.error(f"Image path used: {str(img_path)}")
+                    raise
 
-                logger.debug(f"Inserting image: {img_path}")
-                self.document.add_picture(str(img_path), width=width)
-                self.document.add_paragraph()  # Add spacing after image
             else:
-                logger.warning(f"Image not found: {img_path}")
+                # Log detailed path information if file not found
+                logger.error(f"Image not found at path: {img_path}")
+                logger.error(f"Absolute path: {img_path.absolute()}")
+                logger.error(f"Parent directory exists: {img_path.parent.exists()}")
+                if img_path.parent.exists():
+                    logger.error(f"Files in parent directory: {list(img_path.parent.glob('*'))}")
 
         except Exception as e:
-            logger.error(f"Error inserting image: {str(e)}")
+            logger.error(f"Error in image handling: {str(e)}")
+            import traceback
+            logger.error(f"Traceback: {traceback.format_exc()}")
 
     
     def _process_inline_elements(