Skip to content

Commit

Permalink
Merge branch 'ucbepic:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
staru09 authored Nov 27, 2024
2 parents 8a4a47a + bbd54bc commit dc3f641
Show file tree
Hide file tree
Showing 60 changed files with 7,135 additions and 1,751 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/docker-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Builds the DocETL Docker image and smoke-tests that the container stays running.
name: Docker CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  docker-build-test:
    runs-on: ubuntu-latest

    steps:
      # Presumably frees runner disk space ahead of the image build.
      - name: Remove unnecessary files
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"

      - uses: actions/checkout@v4

      # Drops the Dockerfile instruction that copies .env into the image;
      # presumably the checkout has no .env file, which would fail the build.
      - name: Remove .env copy from Dockerfile
        run: sed -i '/COPY .env/d' Dockerfile

      - name: Build Docker image
        run: docker build -t docetl .

      - name: Create Docker volume
        run: docker volume create docetl-data

      - name: Test Docker container
        run: |
          # Run the container in detached mode
          docker run -d \
            -p 3000:3000 \
            -p 8000:8000 \
            -v docetl-data:/docetl-data \
            -e FRONTEND_HOST=0.0.0.0 \
            -e FRONTEND_PORT=3000 \
            -e BACKEND_HOST=0.0.0.0 \
            -e BACKEND_PORT=8000 \
            --name docetl-test \
            docetl
          # Watch the container for 120 seconds (24 checks, 5 seconds apart) and
          # fail as soon as it stops instead of always sleeping the full period
          for _ in $(seq 1 24); do
            sleep 5
            if [ -z "$(docker ps -q -f name=docetl-test)" ]; then
              echo "Container failed to stay running"
              docker logs docetl-test
              exit 1
            fi
          done
          echo "Container is running successfully"

      # Cleanup runs even when an earlier step failed, so a failed smoke test does
      # not leave the container or the volume behind. Either resource may be
      # missing if an earlier step failed, hence the tolerant "|| true".
      - name: Clean up Docker container
        if: always()
        run: docker rm -f docetl-test || true

      - name: Clean up Docker volume
        if: always()
        run: docker volume rm docetl-data || true
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,7 @@ website/.vercel

# typescript
website/*.tsbuildinfo
website/next-env.d.ts
website/next-env.d.ts

# Docker
.docker/
79 changes: 79 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Build stage for Python dependencies
FROM python:3.11-slim AS python-builder

RUN pip install poetry==1.4.2

# Keep the virtualenv inside the project (/app/.venv) so the runtime stage can
# copy it from this stage.
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache \
    DOCETL_HOME_DIR="/docetl-data"

WORKDIR /app

COPY pyproject.toml poetry.lock ./
COPY docetl/ ./docetl/
COPY server/ ./server/
COPY tests/ ./tests/
# Empty placeholder; presumably pyproject.toml references README.md -- TODO confirm
RUN touch README.md

# Install with --no-root first for dependencies, then install with root for entrypoints.
# The cache mount needs BuildKit. Its target is written out literally (the same
# path as POETRY_CACHE_DIR) so it does not depend on variable expansion inside
# the --mount flag.
RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --all-extras --no-root && \
    poetry install --all-extras

# Build stage for Node.js dependencies
# Uses a Debian-based Node image rather than Alpine: the whole /app/website tree,
# including node_modules, is copied into the apt-based runtime stage below, so any
# native modules must be installed against the same C library (glibc, not musl).
FROM node:20-slim AS node-builder

WORKDIR /app/website

# Update DOCETL_HOME_DIR to match final location
ENV DOCETL_HOME_DIR="/docetl-data"

# Copy the package manifests first so the dependency layer is reused when only
# application sources change.
COPY website/package*.json ./
RUN npm install
COPY website/ ./
RUN npm run build

# Final runtime stage
FROM python:3.11-slim AS runtime

# Install Node.js
RUN apt-get update && apt-get install -y \
    curl \
    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
    && apt-get install -y nodejs \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy Python virtual environment from builder
ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH="/app" \
    DOCETL_HOME_DIR="/docetl-data"

COPY --from=python-builder /app/.venv ${VIRTUAL_ENV}

# Copy Python application files
COPY docetl/ ./docetl/
COPY server/ ./server/
COPY tests/ ./tests/
COPY pyproject.toml poetry.lock ./
# NOTE(review): this stores the local environment file, including any API keys in
# it, inside an image layer. Consider supplying it at run time instead (for
# example with docker run --env-file) if the image is ever shared or pushed.
COPY .env ./

# Copy Node.js dependencies and application files
COPY --from=node-builder /app/website ./website

ENV PORT=3000

# Create data directory with appropriate permissions
RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data

# Define volume AFTER creating and setting permissions
VOLUME ["/docetl-data"]

# Expose ports for frontend and backend
EXPOSE 3000 8000

# Start both servers
# The backend is started in the background; the container's lifetime follows the
# foreground frontend process (npm run start).
# NOTE(review): if the backend exits, the container keeps running with only the
# frontend alive -- nothing here supervises the background process.
CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"]
26 changes: 23 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Load environment variables from .env file
include .env

.PHONY: tests tests-basic lint install mypy update ui-install ui-run
# Command-style targets that do not produce files. The UI targets defined in this
# Makefile are named install-ui, run-ui and run-ui-dev; the older ui-install and
# ui-run spellings are kept in the list for backward compatibility.
.PHONY: tests tests-basic lint install mypy update ui-install ui-run install-ui run-ui run-ui-dev docker docker-clean help

# Existing commands
tests:
Expand All @@ -25,7 +25,7 @@ mypy:
update:
poetry update

# New UI-related commands
# UI-related commands
UI_DIR := ./website

install-ui:
Expand All @@ -43,6 +43,24 @@ run-ui:
echo "Building UI..." && \
cd $(UI_DIR) && npm run build && HOST=${FRONTEND_HOST} PORT=${FRONTEND_PORT} NEXT_PUBLIC_FRONTEND_ALLOWED_HOSTS=${FRONTEND_ALLOWED_HOSTS} npm run start

# Image tag and named volume shared by the docker targets below.
# Override on the command line, e.g. `make docker DOCKER_IMAGE=docetl-dev`.
DOCKER_IMAGE := docetl
DOCKER_VOLUME := docetl-data

# docker-clean is a command, not a file: declare it phony so that a stray file
# with the same name cannot make the target look up to date.
.PHONY: docker-clean

# Single Docker command to build and run
# Creates the named volume, builds the image from the current directory, then
# runs the container interactively (-it requires a terminal) and removes the
# container when it exits (--rm). Data in the volume is kept.
docker:
	docker volume create $(DOCKER_VOLUME) && \
	docker build -t $(DOCKER_IMAGE) . && \
	docker run --rm -it \
		-p 3000:3000 \
		-p 8000:8000 \
		-v $(DOCKER_VOLUME):/docetl-data \
		-e FRONTEND_HOST=0.0.0.0 \
		-e FRONTEND_PORT=3000 \
		-e BACKEND_HOST=0.0.0.0 \
		-e BACKEND_PORT=8000 \
		$(DOCKER_IMAGE)

# Add new command for cleaning up docker resources
# Removes the named volume, deleting the data stored in it.
docker-clean:
	docker volume rm $(DOCKER_VOLUME)

# Help command
help:
@echo "Available commands:"
Expand All @@ -54,5 +72,7 @@ help:
@echo " make update : Update dependencies"
@echo " make install-ui : Install UI dependencies"
@echo " make run-ui-dev : Run UI development server"
@echo " make run-ui-prod : Run UI production server"
@echo " make run-ui : Run UI production server"
@echo " make docker : Build and run docetl in Docker"
@echo " make docker-clean : Remove docetl Docker volume"
@echo " make help : Show this help message"
132 changes: 102 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# DocETL: Powering Complex Document Processing Pipelines
# 📜 DocETL: Powering Complex Document Processing Pipelines

[![Website](https://img.shields.io/badge/Website-docetl.org-blue)](https://docetl.org)
[![Documentation](https://img.shields.io/badge/Documentation-docs-green)](https://ucbepic.github.io/docetl)
Expand All @@ -7,65 +7,119 @@

![DocETL Figure](docs/assets/readmefig.png)

DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers a low-code, declarative YAML interface to define LLM-powered operations on complex data.
DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. It offers:

## When to Use DocETL
1. An interactive UI playground for iterative prompt engineering and pipeline development
2. A Python package for running production pipelines from the command line or Python code

DocETL is the ideal choice when you're looking to maximize correctness and output quality for complex tasks over a collection of documents or unstructured datasets. You should consider using DocETL if:

- You want to perform semantic processing on a collection of data
- You have complex tasks that you want to represent via map-reduce
- You're unsure how to best express your task to maximize LLM accuracy
- You're working with long documents that don't fit into a single prompt
- You have validation criteria and want tasks to automatically retry when validation fails

## Community Projects
### 🌟 Community Projects

- [Conversation Generator](https://github.com/PassionFruits-net/docetl-conversation)
- [Text-to-speech](https://github.com/PassionFruits-net/docetl-speaker)
- [YouTube Transcript Topic Extraction](https://github.com/rajib76/docetl_examples)

## Educational Resources
### 📚 Educational Resources

- [UI/UX Thoughts](https://x.com/sh_reya/status/1846235904664273201)
- [Using Gleaning to Improve Output Quality](https://x.com/sh_reya/status/1843354256335876262)
- [Deep Dive on Resolve Operator](https://x.com/sh_reya/status/1840796824636121288)

## Installation

### Prerequisites
## 🚀 Getting Started

There are two main ways to use DocETL:

### 1. 🎮 Interactive UI Playground (Recommended for Development)

The [UI Playground](https://ucbepic.github.io/docetl/playground/) helps you iteratively develop your pipeline:
- Experiment with different prompts and see results in real-time
- Build your pipeline step by step
- Export your finalized pipeline configuration for production use

![DocETL Playground](docs/assets/tutorial/playground-screenshot.png)

To run the playground locally, you can either:
- Use Docker (recommended for quick start): `make docker`
- Set up the development environment manually

See the [Playground Setup Guide](https://ucbepic.github.io/docetl/playground/) for detailed instructions.

### 2. 📦 Python Package (For Production Use)

If you want to use DocETL as a Python package:

#### Prerequisites
- Python 3.10 or later
- OpenAI API key

### Quick Start

1. Install from PyPI:
```bash
pip install docetl
```

Create a `.env` file in your project directory:
```bash
OPENAI_API_KEY=your_api_key_here # Required for LLM operations (or the key for the LLM of your choice)
```

To see examples of how to use DocETL, check out the [tutorial](https://ucbepic.github.io/docetl/tutorial/).

### Running the UI Locally
### 🎮 UI Playground Setup

We offer a simple UI for building pipelines. We recommend building up complex pipelines one operation at a time, so you can see the results of each operation as you go and iterate on your pipeline. To run it locally, follow these steps:
To run the UI playground locally, you have two options:

![Playground Screenshot](docs/assets/tutorial/playground-screenshot.png)
#### Option A: Using Docker (Recommended for Quick Start)

1. Clone the repository:
The easiest way to get the playground running:

1. Create the required environment files:

Create `.env` in the root directory:
```bash
git clone https://github.com/ucbepic/docetl.git
cd docetl
OPENAI_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
BACKEND_HOST=0.0.0.0
BACKEND_PORT=8000
BACKEND_RELOAD=True
FRONTEND_HOST=0.0.0.0
FRONTEND_PORT=3000
```

2. Install dependencies:
Create `.env.local` in the `website` directory:
```bash
make install # Install Python package
make install-ui # Install UI dependencies
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
MODEL_NAME=gpt-4o-mini

NEXT_PUBLIC_BACKEND_HOST=localhost
NEXT_PUBLIC_BACKEND_PORT=8000
```

2. Run Docker:
```bash
make docker
```

This will:
- Create a Docker volume for persistent data
- Build the DocETL image
- Run the container with the UI accessible at http://localhost:3000

To clean up Docker resources (note that this will delete the Docker volume):
```bash
make docker-clean
```

#### Option B: Manual Setup (Development)

For development or if you prefer not to use Docker:

1. Clone the repository:
```bash
git clone https://github.com/ucbepic/docetl.git
cd docetl
```

3. Set up environment variables in `.env`:
2. Set up environment variables in `.env` in the root/top-level directory:
```bash
OPENAI_API_KEY=your_api_key_here
BACKEND_ALLOW_ORIGINS=
Expand All @@ -76,14 +130,32 @@ FRONTEND_HOST=0.0.0.0
FRONTEND_PORT=3000
```

Then create a `.env.local` file in the `website` directory with the following contents:
```bash
OPENAI_API_KEY=sk-xxx
OPENAI_API_BASE=https://api.openai.com/v1
MODEL_NAME=gpt-4o-mini

NEXT_PUBLIC_BACKEND_HOST=localhost
NEXT_PUBLIC_BACKEND_PORT=8000
```

3. Install dependencies:
```bash
make install # Install Python package
make install-ui # Install UI dependencies
```

Note that the OpenAI API key, API base URL, and model name set in `website/.env.local` are used only by the UI assistant, not by the DocETL pipeline execution engine.

4. Start the development server:
```bash
make run-ui-dev
```

5. Visit http://localhost:3000/playground
5. Visit http://localhost:3000/playground to access the interactive UI.

### Development Setup
### 🛠️ Development Setup

If you're planning to contribute or modify DocETL, you can verify your setup by running the test suite:

Expand Down
Loading

0 comments on commit dc3f641

Please sign in to comment.