Skip to content

Commit

Permalink
some changes
Browse files Browse the repository at this point in the history
  • Loading branch information
ujjwal-ibm committed Dec 16, 2024
1 parent 608e30c commit eb55746
Showing 1 changed file with 69 additions and 23 deletions.
92 changes: 69 additions & 23 deletions src/boxtodocx/handlers/docx_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,31 +38,33 @@ def _setup_document(self) -> None:
font.name = DEFAULT_FONT_NAME
font.size = Pt(DEFAULT_FONT_SIZE)

def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path], assets_dir: Union[str, Path] = None) -> Path:
def convert_html_to_docx(
self,
html_content: str,
output_path: Union[str, Path],
assets_dir: Optional[Union[str, Path]] = None
) -> Path:
"""
Convert HTML content to DOCX format.
Args:
html_content: HTML string to convert
output_path: Path for output DOCX file
assets_dir: Directory containing images and other assets
Returns:
Path to created DOCX file
Raises:
ValueError: If HTML content is invalid
"""
try:
output_path = Path(output_path)
self.assets_dir = Path(assets_dir) if assets_dir else output_path.parent
soup = BeautifulSoup(html_content, 'html.parser')

# Parse HTML content
soup = BeautifulSoup(html_content, 'html.parser')
if not soup.body:
raise ValueError("Invalid HTML: no body tag found")


# Process all elements
self._process_elements(soup.body)

# Save the document
output_path.parent.mkdir(parents=True, exist_ok=True)
self.document.save(str(output_path))

Expand All @@ -72,6 +74,7 @@ def convert_html_to_docx(self, html_content: str, output_path: Union[str, Path],
except Exception as e:
logger.error(f"DOCX conversion failed: {str(e)}")
raise


def _process_elements(self, parent: BeautifulSoup) -> None:
"""Process HTML elements recursively."""
Expand Down Expand Up @@ -122,39 +125,82 @@ def _handle_table(self, element: BeautifulSoup) -> None:
def _handle_image(self, element: BeautifulSoup) -> None:
    """Insert the image referenced by an ``<img>`` element into the document.

    The ``src`` attribute is resolved against ``self.assets_dir``: a value
    starting with ``images/`` is used as a path relative to the assets
    directory, anything else is reduced to its file name and looked up in
    ``<assets_dir>/images``. The picture is added in its own paragraph,
    followed by an empty paragraph for spacing.

    Width is derived from the pixel width at 96 pixels per inch and capped
    at 6 inches; if the size cannot be read, 6 inches is used.

    This is a best-effort operation: every failure is logged (with
    traceback) and swallowed, so a bad image never aborts the conversion.

    Args:
        element: The ``<img>`` tag to process.
    """
    try:
        logger.info("Starting image handling in DOCX")
        src = element.get('src')
        logger.debug(f"Image src attribute: {src}")

        if not src:
            logger.warning("Image element without src attribute")
            return

        logger.debug(f"Assets directory: {self.assets_dir}")

        # NOTE: `Path` must come from the module-level import. Importing it
        # again inside this function would make it a local name and raise
        # UnboundLocalError on the `Path(src)` call below.
        if src.startswith('images/'):
            img_path = self.assets_dir / src
            logger.debug(f"Using relative path: {img_path}")
        else:
            img_path = self.assets_dir / 'images' / Path(src).name
            logger.debug(f"Using direct filename: {img_path}")

        logger.info(f"Checking image at path: {img_path}")
        if not img_path.exists():
            # Log enough path detail to diagnose a wrong assets directory.
            logger.error(f"Image not found at path: {img_path}")
            logger.error(f"Absolute path: {img_path.absolute()}")
            logger.error(f"Parent directory exists: {img_path.parent.exists()}")
            if img_path.parent.exists():
                logger.error(f"Files in parent directory: {list(img_path.parent.glob('*'))}")
            return

        logger.info(f"Found image file: {img_path}")
        logger.debug(f"Image file size: {img_path.stat().st_size} bytes")
        logger.debug(f"Image extension: {img_path.suffix}")

        # Doubles as the fallback width when the image size is unreadable.
        max_width = Inches(6)

        try:
            from PIL import Image
            with Image.open(img_path) as img:
                w, h = img.size
            logger.debug(f"Original image dimensions: {w}x{h} pixels")

            # Only the width is passed to add_picture below, so the aspect
            # ratio is preserved.
            if w > 600:
                width = min(Inches(w / 96), max_width)
                logger.debug(f"Scaling large image to width: {width}")
            else:
                width = Inches(w / 96)
                logger.debug(f"Using original image width: {width}")
        except Exception as e:
            logger.warning(f"Could not process image size, using default: {str(e)}")
            width = max_width

        logger.info(f"Adding image to document: {img_path}")
        try:
            run = self.document.add_paragraph().add_run()
            run.add_picture(str(img_path), width=width)
            self.document.add_paragraph()  # Add spacing after the image
            logger.info(f"Successfully inserted image: {img_path.name}")
        except Exception as e:
            logger.error(f"Failed to insert image into document: {str(e)}")
            logger.error(f"Image path used: {str(img_path)}")
            raise

    except Exception as e:
        # logger.exception records the message together with the traceback.
        logger.exception(f"Error in image handling: {str(e)}")


def _process_inline_elements(
Expand Down

0 comments on commit eb55746

Please sign in to comment.