Merge branch 'ucbepic:main' into main

staru09 · Nov 27, 2024 · dc3f641 · dc3f641
2 parents 8a4a47a + bbd54bc
commit dc3f641
Show file tree

Hide file tree

Showing 60 changed files with 7,135 additions and 1,751 deletions.
diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml
@@ -0,0 +1,59 @@
+# CI workflow: builds the DocETL Docker image and smoke-tests the resulting container.
+name: Docker CI
+
+# Runs on every push to main and on every pull request targeting main.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  docker-build-test:
+    runs-on: ubuntu-latest
+
+    steps:
+    # Deletes /usr/share/dotnet and the directory named by $AGENT_TOOLSDIRECTORY;
+    # presumably to free runner disk space for the image build (TODO confirm).
+    - name: Remove unnecessary files
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+    - uses: actions/checkout@v4
+    # Deletes every Dockerfile line matching the pattern "COPY .env", so the image
+    # build does not need a .env file (presumably absent from the CI checkout).
+    - name: Remove .env copy from Dockerfile
+      run: sed -i '/COPY .env/d' Dockerfile
+
+    - name: Build Docker image
+      run: docker build -t docetl .
+
+    # Named volume that the test container mounts at /docetl-data.
+    - name: Create Docker volume
+      run: docker volume create docetl-data
+
+    # Smoke test: starts the container detached, waits a fixed 120 seconds, then
+    # fails the job (after printing the container logs) if the container is no
+    # longer listed by `docker ps`. The container is stopped and removed only on
+    # the success path.
+    - name: Test Docker container
+      run: |
+        # Run the container in detached mode
+        docker run -d \
+          -p 3000:3000 \
+          -p 8000:8000 \
+          -v docetl-data:/docetl-data \
+          -e FRONTEND_HOST=0.0.0.0 \
+          -e FRONTEND_PORT=3000 \
+          -e BACKEND_HOST=0.0.0.0 \
+          -e BACKEND_PORT=8000 \
+          --name docetl-test \
+          docetl
+          
+        # Wait for container to start up
+        sleep 120
+        
+        # Check if container is still running
+        if [ "$(docker ps -q -f name=docetl-test)" ]; then
+          echo "Container is running successfully"
+        else
+          echo "Container failed to stay running"
+          docker logs docetl-test
+          exit 1
+        fi
+        
+        # Cleanup
+        docker stop docetl-test
+        docker rm docetl-test
+        
+    # NOTE(review): this step has no `if: always()`, so it is skipped whenever an
+    # earlier step fails.
+    - name: Clean up Docker volume
+      run: docker volume rm docetl-data 
diff --git a/.gitignore b/.gitignore
@@ -49,4 +49,7 @@ website/.vercel
 
 # typescript
 website/*.tsbuildinfo
-website/next-env.d.ts
+website/next-env.d.ts
+
+# Docker
+.docker/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,79 @@
+# Build stage for Python dependencies.
+# Installs the project and all of its extras with Poetry; the resulting
+# virtualenv at /app/.venv is what the runtime stage copies out of this stage.
+FROM python:3.11-slim AS python-builder
+
+RUN pip install poetry==1.4.2
+
+# POETRY_VIRTUALENVS_IN_PROJECT=1 makes Poetry create the virtualenv inside the
+# project directory (/app/.venv); POETRY_CACHE_DIR is the path used as the cache
+# mount target in the install step below.
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache \
+    DOCETL_HOME_DIR="/docetl-data"
+
+WORKDIR /app
+
+COPY pyproject.toml poetry.lock ./
+COPY docetl/ ./docetl/
+COPY server/ ./server/
+COPY tests/ ./tests/
+# Creates an empty README.md; presumably pyproject.toml references it and the
+# root-package install needs the file to exist (TODO confirm).
+RUN touch README.md
+
+# Install with --no-root first for dependencies, then install with root for entrypoints
+RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --all-extras --no-root && \
+    poetry install --all-extras
+
+# Build stage for Node.js dependencies.
+# Installs the website's npm dependencies and runs its production build; the
+# whole /app/website directory is later copied into the runtime stage.
+FROM node:20-alpine AS node-builder
+
+WORKDIR /app/website
+
+# Same DOCETL_HOME_DIR value as the one set in the runtime stage.
+ENV DOCETL_HOME_DIR="/docetl-data"
+
+# The package manifests are copied before the rest of the sources, so the
+# `npm install` layer can be reused when only source files change.
+COPY website/package*.json ./
+RUN npm install
+COPY website/ ./
+RUN npm run build
+
+# Final runtime stage.
+# Combines the Python virtualenv from python-builder and the built website from
+# node-builder in one image that serves both the backend and the frontend.
+FROM python:3.11-slim AS runtime
+
+# Install Node.js 20.x via the NodeSource setup script (curl is installed only
+# to fetch that script); the apt package lists are removed afterwards.
+RUN apt-get update && apt-get install -y \
+    curl \
+    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy Python virtual environment from builder.
+# Putting /app/.venv/bin first on PATH makes `python3` in CMD resolve to the
+# virtualenv interpreter; PYTHONPATH=/app makes the copied packages importable.
+ENV VIRTUAL_ENV=/app/.venv \
+    PATH="/app/.venv/bin:$PATH" \
+    PYTHONPATH="/app" \
+    DOCETL_HOME_DIR="/docetl-data"
+
+COPY --from=python-builder /app/.venv ${VIRTUAL_ENV}
+
+# Copy Python application files
+COPY docetl/ ./docetl/
+COPY server/ ./server/
+COPY tests/ ./tests/
+COPY pyproject.toml poetry.lock ./
+# NOTE(review): this bakes the local .env (which the README says holds the API
+# key) into an image layer. The CI workflow deletes this exact line with sed
+# before building, so keep the "COPY .env" text in sync with that pattern.
+COPY .env ./
+
+# Copy Node.js dependencies and application files
+COPY --from=node-builder /app/website ./website
+
+ENV PORT=3000
+
+# Create data directory with appropriate permissions.
+# NOTE(review): no USER instruction is visible in this stage, so the processes
+# presumably run as root; the chown to nobody plus chmod 777 then only matters
+# if the container is started with a different user -- confirm the intent.
+RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data
+
+# Define volume AFTER creating and setting permissions
+VOLUME ["/docetl-data"]
+
+# Expose ports for frontend and backend
+EXPOSE 3000 8000
+
+# Start both servers: the Python backend is launched in the background, then
+# the website's `npm run start` runs in the foreground of the same shell.
+CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"]
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 # Load environment variables from .env file
 include .env
 
-.PHONY: tests tests-basic lint install mypy update ui-install ui-run
+# Every command-style target is declared phony so that a file or directory of
+# the same name (e.g. the tests/ directory) can never make it look up to date.
+# The UI entries use the real target names (install-ui, run-ui-dev, run-ui).
+.PHONY: tests tests-basic lint install mypy update install-ui run-ui-dev run-ui docker docker-clean help
 
 # Existing commands
 tests:
@@ -25,7 +25,7 @@ mypy:
 update:
 	poetry update
 
-# New UI-related commands
+# UI-related commands
 UI_DIR := ./website 
 
 install-ui:
@@ -43,6 +43,24 @@ run-ui:
 	echo "Building UI..." && \
 	cd $(UI_DIR) && npm run build && HOST=${FRONTEND_HOST}  PORT=${FRONTEND_PORT} NEXT_PUBLIC_FRONTEND_ALLOWED_HOSTS=${FRONTEND_ALLOWED_HOSTS} npm run start
 
+# Build the docetl image and run it interactively with persistent storage.
+# The three steps run in order and make stops at the first one that fails:
+#   1. create the named volume docetl-data,
+#   2. build the image tagged docetl from the repository root,
+#   3. run a throwaway container (--rm) that publishes ports 3000 and 8000
+#      and mounts the volume at /docetl-data.
+docker:
+	docker volume create docetl-data
+	docker build --tag docetl .
+	docker run --rm --interactive --tty \
+		--publish 3000:3000 \
+		--publish 8000:8000 \
+		--volume docetl-data:/docetl-data \
+		--env FRONTEND_HOST=0.0.0.0 \
+		--env FRONTEND_PORT=3000 \
+		--env BACKEND_HOST=0.0.0.0 \
+		--env BACKEND_PORT=8000 \
+		docetl
+
+# Remove the docetl-data volume that the docker target creates.
+# This deletes whatever the container stored under /docetl-data.
+docker-clean:
+	docker volume rm docetl-data
+
 # Help command
 help:
 	@echo "Available commands:"
@@ -54,5 +72,7 @@ help:
 	@echo "  make update       : Update dependencies"
 	@echo "  make install-ui   : Install UI dependencies"
 	@echo "  make run-ui-dev   : Run UI development server"
-	@echo "  make run-ui-prod  : Run UI production server"
+	@echo "  make run-ui       : Run UI production server"
+	@echo "  make docker       : Build and run docetl in Docker"
+	@echo "  make docker-clean : Remove docetl Docker volume"
 	@echo "  make help         : Show this help message"
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# DocETL: Powering Complex Document Processing Pipelines
+# 📜 DocETL: Powering Complex Document Processing Pipelines
 
 [![Website](https://img.shields.io/badge/Website-docetl.org-blue)](https://docetl.org)
 [![Documentation](https://img.shields.io/badge/Documentation-docs-green)](https://ucbepic.github.io/docetl)
@@ -7,65 +7,119 @@
 
 ![DocETL Figure](docs/assets/readmefig.png)
 
-DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers a low-code, declarative YAML interface to define LLM-powered operations on complex data.
+DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers:
 
-## When to Use DocETL
+1. An interactive UI playground for iterative prompt engineering and pipeline development
+2. A Python package for running production pipelines from the command line or Python code
 
-DocETL is the ideal choice when you're looking to maximize correctness and output quality for complex tasks over a collection of documents or unstructured datasets. You should consider using DocETL if:
-
-- You want to perform semantic processing on a collection of data
-- You have complex tasks that you want to represent via map-reduce
-- You're unsure how to best express your task to maximize LLM accuracy
-- You're working with long documents that don't fit into a single prompt
-- You have validation criteria and want tasks to automatically retry when validation fails
-
-## Community Projects
+### 🌟 Community Projects
 
 - [Conversation Generator](https://github.com/PassionFruits-net/docetl-conversation)
 - [Text-to-speech](https://github.com/PassionFruits-net/docetl-speaker)
 - [YouTube Transcript Topic Extraction](https://github.com/rajib76/docetl_examples)
 
-## Educational Resources
+### 📚 Educational Resources
 
 - [UI/UX Thoughts](https://x.com/sh_reya/status/1846235904664273201)
 - [Using Gleaning to Improve Output Quality](https://x.com/sh_reya/status/1843354256335876262)
 - [Deep Dive on Resolve Operator](https://x.com/sh_reya/status/1840796824636121288)
 
-## Installation
 
-### Prerequisites
+## 🚀 Getting Started
+
+There are two main ways to use DocETL:
+
+### 1. 🎮 Interactive UI Playground (Recommended for Development)
+
+The [UI Playground](https://ucbepic.github.io/docetl/playground/) helps you iteratively develop your pipeline:
+- Experiment with different prompts and see results in real-time
+- Build your pipeline step by step
+- Export your finalized pipeline configuration for production use
+
+![DocETL Playground](docs/assets/tutorial/playground-screenshot.png)
+
+To run the playground locally, you can either:
+- Use Docker (recommended for quick start): `make docker`
+- Set up the development environment manually
 
+See the [Playground Setup Guide](https://ucbepic.github.io/docetl/playground/) for detailed instructions.
+
+### 2. 📦 Python Package (For Production Use)
+
+If you want to use DocETL as a Python package:
+
+#### Prerequisites
 - Python 3.10 or later
 - OpenAI API key
 
-### Quick Start
-
-1. Install from PyPI:
 ```bash
 pip install docetl
 ```
 
+Create a `.env` file in your project directory:
+```bash
+OPENAI_API_KEY=your_api_key_here  # Required for LLM operations (or the key for the LLM of your choice)
+```
+
 To see examples of how to use DocETL, check out the [tutorial](https://ucbepic.github.io/docetl/tutorial/).
 
-### Running the UI Locally
+### 🎮 UI Playground Setup
 
-We offer a simple UI for building pipelines. We recommend building up complex pipelines one operation at a time, so you can see the results of each operation as you go and iterate on your pipeline. To run it locally, follow these steps:
+To run the UI playground locally, you have two options:
 
-![Playground Screenshot](docs/assets/tutorial/playground-screenshot.png)
+#### Option A: Using Docker (Recommended for Quick Start)
 
-1. Clone the repository:
+The easiest way to get the playground running:
+
+1. Create the required environment files:
+
+Create `.env` in the root directory:
 ```bash
-git clone https://github.com/ucbepic/docetl.git
-cd docetl
+OPENAI_API_KEY=your_api_key_here
+BACKEND_ALLOW_ORIGINS=
+BACKEND_HOST=0.0.0.0
+BACKEND_PORT=8000
+BACKEND_RELOAD=True
+FRONTEND_HOST=0.0.0.0
+FRONTEND_PORT=3000
 ```
 
-2. Install dependencies:
+Create `.env.local` in the `website` directory:
 ```bash
-make install      # Install Python package
-make install-ui   # Install UI dependencies
+OPENAI_API_KEY=sk-xxx
+OPENAI_API_BASE=https://api.openai.com/v1
+MODEL_NAME=gpt-4o-mini
+
+NEXT_PUBLIC_BACKEND_HOST=localhost
+NEXT_PUBLIC_BACKEND_PORT=8000
+```
+
+2. Run Docker:
+```bash
+make docker
+```
+
+This will:
+- Create a Docker volume for persistent data
+- Build the DocETL image
+- Run the container with the UI accessible at http://localhost:3000
+
+To clean up Docker resources (note that this will delete the Docker volume):
+```bash
+make docker-clean
+```
+
+#### Option B: Manual Setup (Development)
+
+For development or if you prefer not to use Docker:
+
+1. Clone the repository:
+```bash
+git clone https://github.com/ucbepic/docetl.git
+cd docetl
 ```
 
-3. Set up environment variables in `.env`:
+2. Set up environment variables in `.env` in the root/top-level directory:
 ```bash
 OPENAI_API_KEY=your_api_key_here
 BACKEND_ALLOW_ORIGINS=
@@ -76,14 +130,32 @@ FRONTEND_HOST=0.0.0.0
 FRONTEND_PORT=3000
 ```
 
+Also create a `.env.local` file in the `website` directory with the following contents:
+```bash
+OPENAI_API_KEY=sk-xxx
+OPENAI_API_BASE=https://api.openai.com/v1
+MODEL_NAME=gpt-4o-mini
+
+NEXT_PUBLIC_BACKEND_HOST=localhost
+NEXT_PUBLIC_BACKEND_PORT=8000
+```
+
+3. Install dependencies:
+```bash
+make install      # Install Python package
+make install-ui   # Install UI dependencies
+```
+
+Note that the OpenAI API key, base URL, and model name in `.env.local` are used only by the UI assistant, not by the DocETL pipeline execution engine.
+
 4. Start the development server:
 ```bash
 make run-ui-dev
 ```
 
-5. Visit http://localhost:3000/playground
+5. Visit http://localhost:3000/playground to access the interactive UI.
 
-### Development Setup
+### 🛠️ Development Setup
 
 If you're planning to contribute or modify DocETL, you can verify your setup by running the test suite: