Skip to content

Commit

Permalink
updated convertor for using box token
Browse files Browse the repository at this point in the history
  • Loading branch information
ujjwal-ibm committed Dec 16, 2024
1 parent 09e5214 commit 716f61b
Showing 1 changed file with 110 additions and 58 deletions.
168 changes: 110 additions & 58 deletions src/boxtodocx/convertor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,91 +31,143 @@ def convert(
credentials: Optional[Dict[str, str]] = None,
api_token: Optional[str] = None
) -> Tuple[Path, Path, List[Path]]:
"""
Convert a single Box document to HTML and DOCX.
Args:
filepath: Path to Box document
credentials: Optional Box credentials for image download
api_token: Optional Box API token for direct download
Returns:
Tuple of (HTML path, DOCX path, list of image paths)
Raises:
FileNotFoundError: If input file doesn't exist
ValueError: If file format is invalid
"""
"""Convert a single Box document."""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"Input file not found: {filepath}")

try:
logger.info(f"Converting {filepath}")
docx_dir, assets_dir = self._setup_output_dirs(filepath)
# Convert to HTML first
html_path, image_paths = self.html_handler.convert_file(
str(filepath),
credentials,
api_token
)

# Convert HTML to DOCX
with open(html_path, 'r', encoding='utf-8') as f:
html_content = f.read()

docx_path = self.output_dir / filepath.with_suffix('.docx').name
self.docx_handler.convert_html_to_docx(html_content, docx_path, assets_dir)
# Set up all paths
docx_path, assets_dir, html_path = self._setup_paths(filepath)

logger.info(f"Conversion completed: {filepath}")
return html_path, docx_path, image_paths
# Initialize HTML handler with the assets directory
html_handler = HTMLHandler(str(assets_dir))
if api_token:
html_handler.set_api_token(api_token)

# Convert to HTML first
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)

if not isinstance(data, dict):
raise ValueError("Invalid BoxNote format: root must be an object")

content = data.get('doc', {}).get('content', [])
if not isinstance(content, list):
raise ValueError("Invalid BoxNote format: content must be a list")

html_content, image_paths = html_handler.convert_to_html(
content,
credentials,
api_token
)

# Write HTML file
with open(str(html_path), 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Created HTML file: {html_path}")

# Convert HTML to DOCX
docx_handler = DOCXHandler()
docx_handler.convert_html_to_docx(
html_content,
str(docx_path), # Convert Path to string
str(assets_dir) # Convert Path to string
)
logger.info(f"Created DOCX file: {docx_path}")

return html_path, docx_path, image_paths

except json.JSONDecodeError as e:
logger.error(f"Invalid JSON in input file: {str(e)}")
raise

except Exception as e:
logger.error(f"Conversion failed for {filepath}: {str(e)}")
raise

def _setup_output_dirs(self, input_path: Path) -> Tuple[Path, Path]:
"""Set up output directories for a given input file."""
docx_dir = input_path.parent
assets_dir = self.base_output_dir / input_path.stem
def _setup_output_dirs(self, input_path: Path) -> Tuple[Path, Path, Path]:
"""
Set up output directories for a given input file.
# Ensure directories exist
docx_dir.mkdir(parents=True, exist_ok=True)
assets_dir.mkdir(parents=True, exist_ok=True)
Args:
input_path: Path to input file
Returns:
Tuple of (docx_path, html_dir, html_path)
"""
# DOCX goes next to the input file
docx_path = input_path.with_suffix('.docx')

return docx_dir, assets_dir

def convert_directory(
self,
directory: Union[str, Path],
credentials: Optional[Dict[str, str]] = None
) -> List[Tuple[Path, Path, List[Path]]]:
# Create a folder with the same name as the boxnote for HTML and images
html_dir = input_path.parent / input_path.stem
html_dir.mkdir(parents=True, exist_ok=True)

# HTML file path inside the folder
html_path = html_dir / input_path.with_suffix('.html').name

logger.debug(f"Set up paths - DOCX: {docx_path}, HTML Dir: {html_dir}, HTML: {html_path}")
return docx_path, html_dir, html_path

def _setup_paths(self, input_path: Path) -> Tuple[Path, Path, Path]:
    """
    Set up all output paths for a given input file.

    For an input ``<dir>/<name>.<ext>`` this creates the folders
    ``<dir>/<name>/`` and ``<dir>/<name>/images/`` on disk and computes
    where the DOCX and HTML outputs belong. The DOCX and HTML files
    themselves are not created here.

    Args:
        input_path: Path to Box note file

    Returns:
        Tuple of (docx_path, assets_dir, html_path)
    """
    # DOCX goes next to the original file
    docx_path = input_path.with_suffix('.docx')

    # Folder named after the note; it holds the HTML file and an images/
    # subfolder.
    # NOTE(review): for an input without a suffix, stem == name, so this
    # path is the input file itself and mkdir() would raise
    # FileExistsError - confirm callers only pass *.boxnote files.
    assets_dir = input_path.parent / input_path.stem
    assets_dir.mkdir(parents=True, exist_ok=True)

    # HTML goes inside the folder
    html_path = assets_dir / input_path.with_suffix('.html').name

    # Create images folder inside the assets directory
    (assets_dir / 'images').mkdir(parents=True, exist_ok=True)

    logger.info(f"Processing {input_path.name}:")
    logger.info(f"- DOCX will be created at: {docx_path}")
    logger.info(f"- HTML and images will be in: {assets_dir}")

    return docx_path, assets_dir, html_path

def convert_directory(
    self,
    directory: Union[str, Path],
    credentials: Optional[Dict[str, str]] = None,
    api_token: Optional[str] = None
) -> List[Tuple[Path, Path, List[Path]]]:
    """
    Convert all Box documents in a directory.

    Only ``*.boxnote`` files directly inside ``directory`` are considered
    (the glob is not recursive). Each file is passed to ``self.convert``
    together with ``credentials`` and ``api_token``. A file whose
    conversion raises is logged and skipped, so one bad note does not
    abort the rest of the batch.

    Args:
        directory: Directory containing Box documents
        credentials: Optional Box credentials, forwarded to ``convert``
        api_token: Optional Box API token, forwarded to ``convert``

    Returns:
        One ``self.convert`` result per successfully converted file, in
        sorted path order. Empty when no ``.boxnote`` files are found.

    Raises:
        FileNotFoundError: If ``directory`` does not exist
    """
    directory = Path(directory)
    if not directory.exists():
        raise FileNotFoundError(f"Directory not found: {directory}")

    results = []
    # glob() yields entries in arbitrary order; sort so the processing
    # order, the log output and the returned list are reproducible.
    boxnotes = sorted(directory.glob('*.boxnote'))

    if not boxnotes:
        logger.warning(f"No .boxnote files found in {directory}")
        return results

    logger.info(f"Found {len(boxnotes)} .boxnote files in {directory}")

    for filepath in boxnotes:
        try:
            result = self.convert(filepath, credentials, api_token)
        except Exception as e:
            # Best effort: record the failure and move on to the next file.
            logger.error(f"Failed to convert {filepath.name}: {str(e)}")
        else:
            results.append(result)

    return results

@staticmethod
Expand Down

0 comments on commit 716f61b

Please sign in to comment.