From 716f61b0e005685540d51d524d1bff6abd95c338 Mon Sep 17 00:00:00 2001 From: Ujjwal Kumar Date: Mon, 16 Dec 2024 12:15:28 +0530 Subject: [PATCH] updated convertor for using box token --- src/boxtodocx/convertor.py | 168 ++++++++++++++++++++++++------------- 1 file changed, 110 insertions(+), 58 deletions(-) diff --git a/src/boxtodocx/convertor.py b/src/boxtodocx/convertor.py index e96624c..c0b4f2c 100644 --- a/src/boxtodocx/convertor.py +++ b/src/boxtodocx/convertor.py @@ -31,91 +31,143 @@ def convert( credentials: Optional[Dict[str, str]] = None, api_token: Optional[str] = None ) -> Tuple[Path, Path, List[Path]]: - """ - Convert a single Box document to HTML and DOCX. - - Args: - filepath: Path to Box document - credentials: Optional Box credentials for image download - api_token: Optional Box API token for direct download - - Returns: - Tuple of (HTML path, DOCX path, list of image paths) - - Raises: - FileNotFoundError: If input file doesn't exist - ValueError: If file format is invalid - """ + """Convert a single Box document.""" filepath = Path(filepath) if not filepath.exists(): raise FileNotFoundError(f"Input file not found: {filepath}") try: - logger.info(f"Converting {filepath}") - docx_dir, assets_dir = self._setup_output_dirs(filepath) - # Convert to HTML first - html_path, image_paths = self.html_handler.convert_file( - str(filepath), - credentials, - api_token - ) - - # Convert HTML to DOCX - with open(html_path, 'r', encoding='utf-8') as f: - html_content = f.read() - - docx_path = self.output_dir / filepath.with_suffix('.docx').name - self.docx_handler.convert_html_to_docx(html_content, docx_path, assets_dir) + # Set up all paths + docx_path, assets_dir, html_path = self._setup_paths(filepath) - logger.info(f"Conversion completed: {filepath}") - return html_path, docx_path, image_paths + # Initialize HTML handler with the assets directory + html_handler = HTMLHandler(str(assets_dir)) + if api_token: + html_handler.set_api_token(api_token) + # Convert to HTML first + try: + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + + if not isinstance(data, dict): + raise ValueError("Invalid BoxNote format: root must be an object") + + content = data.get('doc', {}).get('content', []) + if not isinstance(content, list): + raise ValueError("Invalid BoxNote format: content must be a list") + + html_content, image_paths = html_handler.convert_to_html( + content, + credentials, + api_token + ) + + # Write HTML file + with open(str(html_path), 'w', encoding='utf-8') as f: + f.write(html_content) + logger.info(f"Created HTML file: {html_path}") + + # Convert HTML to DOCX + docx_handler = DOCXHandler() + docx_handler.convert_html_to_docx( + html_content, + str(docx_path), # Convert Path to string + str(assets_dir) # Convert Path to string + ) + logger.info(f"Created DOCX file: {docx_path}") + + return html_path, docx_path, image_paths + + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in input file: {str(e)}") + raise + except Exception as e: logger.error(f"Conversion failed for {filepath}: {str(e)}") raise - def _setup_output_dirs(self, input_path: Path) -> Tuple[Path, Path]: - """Set up output directories for a given input file.""" - docx_dir = input_path.parent - assets_dir = self.base_output_dir / input_path.stem + def _setup_output_dirs(self, input_path: Path) -> Tuple[Path, Path, Path]: + """ + Set up output directories for a given input file. - # Ensure directories exist - docx_dir.mkdir(parents=True, exist_ok=True) - assets_dir.mkdir(parents=True, exist_ok=True) + Args: + input_path: Path to input file + + Returns: + Tuple of (docx_path, html_dir, html_path) + """ + # DOCX goes next to the input file + docx_path = input_path.with_suffix('.docx') - return docx_dir, assets_dir - - def convert_directory( - self, - directory: Union[str, Path], - credentials: Optional[Dict[str, str]] = None - ) -> List[Tuple[Path, Path, List[Path]]]: + # Create a folder with the same name as the boxnote for HTML and images + html_dir = input_path.parent / input_path.stem + html_dir.mkdir(parents=True, exist_ok=True) + + # HTML file path inside the folder + html_path = html_dir / input_path.with_suffix('.html').name + + logger.debug(f"Set up paths - DOCX: {docx_path}, HTML Dir: {html_dir}, HTML: {html_path}") + return docx_path, html_dir, html_path + + def _setup_paths(self, input_path: Path) -> Tuple[Path, Path, Path]: """ - Convert all Box documents in a directory. + Set up all output paths for a given input file. Args: - directory: Directory containing Box documents - credentials: Optional Box credentials for image download + input_path: Path to Box note file Returns: - List of (HTML path, DOCX path, image paths) tuples - - Raises: - FileNotFoundError: If directory doesn't exist + Tuple of (docx_path, assets_dir, html_path) """ + # DOCX goes next to the original file + docx_path = input_path.with_suffix('.docx') + + # Create a folder with the same name as the boxnote + assets_dir = input_path.parent / input_path.stem + assets_dir.mkdir(parents=True, exist_ok=True) + + # HTML goes inside the folder + html_path = assets_dir / input_path.with_suffix('.html').name + + # Create images folder inside the assets directory + images_dir = assets_dir / 'images' + images_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"Processing {input_path.name}:") + logger.info(f"- DOCX will be created at: {docx_path}") + logger.info(f"- HTML and images will be in: {assets_dir}") + + return docx_path, assets_dir, html_path + + def convert_directory( + self, + directory: Union[str, Path], + credentials: Optional[Dict[str, str]] = None, + api_token: Optional[str] = None + ) -> List[Tuple[Path, Path, List[Path]]]: + """Convert all Box documents in a directory.""" directory = Path(directory) if not directory.exists(): raise FileNotFoundError(f"Directory not found: {directory}") - + results = [] - for filepath in directory.glob('*.boxnote'): + boxnotes = list(directory.glob('*.boxnote')) + + if not boxnotes: + logger.warning(f"No .boxnote files found in {directory}") + return results + + logger.info(f"Found {len(boxnotes)} .boxnote files in {directory}") + + for filepath in boxnotes: try: - result = self.convert(filepath, credentials) + result = self.convert(filepath, credentials, api_token) results.append(result) except Exception as e: - logger.error(f"Failed to convert {filepath}: {str(e)}") + logger.error(f"Failed to convert {filepath.name}: {str(e)}") continue - + return results @staticmethod