feat: Dagster Data pipeline #798

Closed · wants to merge 18 commits
3 changes: 3 additions & 0 deletions python/tabby-eval/.gitignore
@@ -0,0 +1,3 @@
tmp*
tabby_data_pipeline.egg-info
log.txt
48 changes: 48 additions & 0 deletions python/tabby-eval/README.md
@@ -0,0 +1,48 @@
# tabby_data_pipeline

This is a [Dagster](https://dagster.io/) project scaffolded with [`dagster project scaffold`](https://docs.dagster.io/getting-started/create-new-project).

## Getting started

First, install your Dagster code location as a Python package. The `-e` (`--editable`) flag below installs the package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs), so local code changes take effect automatically as you develop.

```bash
pip install -e ".[dev]"
```

Then, start the Dagster UI web server:

```bash
dagster dev
```

Open http://localhost:3000 in your browser to see the project.

You can start writing assets in `tabby_data_pipeline/assets.py`. The assets are automatically loaded into the Dagster code location as you define them.
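
For orientation, here is a minimal sketch of what an asset definition looks like (the asset name and body are illustrative, not part of this project):

```python
from dagster import asset


@asset
def example_numbers() -> list[int]:
    # A trivial asset: Dagster tracks this returned list as a materialized value.
    return [1, 2, 3]
```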

## Development


### Adding new Python dependencies

You can specify new Python dependencies in `setup.py`.

### Unit testing

Tests live in the `tabby_data_pipeline_tests` directory; run them with `pytest`:

```bash
pytest tabby_data_pipeline_tests
```

### Schedules and sensors

If you want to enable Dagster [Schedules](https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules) or [Sensors](https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors) for your jobs, the [Dagster Daemon](https://docs.dagster.io/deployment/dagster-daemon) process must be running. This is done automatically when you run `dagster dev`.

Once your Dagster Daemon is running, you can start turning on schedules and sensors for your jobs.
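
As a sketch, a schedule that rebuilds every asset daily could look like this (the job and schedule names are assumptions for illustration, and the schedule would still need to be registered in the project's `Definitions`):

```python
from dagster import AssetSelection, ScheduleDefinition, define_asset_job

# A job that materializes every asset in this code location.
refresh_job = define_asset_job("refresh_all", selection=AssetSelection.all())

# Run it every day at midnight; requires the Dagster Daemon to be running.
daily_refresh = ScheduleDefinition(job=refresh_job, cron_schedule="0 0 * * *")
```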

## Deploy on Dagster Cloud

The easiest way to deploy your Dagster project is to use Dagster Cloud.

Check out the [Dagster Cloud Documentation](https://docs.dagster.cloud) to learn more.
452 changes: 452 additions & 0 deletions python/tabby-eval/edit_distance_analysis.ipynb

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions python/tabby-eval/pyproject.toml
@@ -0,0 +1,6 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.dagster]
module_name = "tabby_data_pipeline"
2 changes: 2 additions & 0 deletions python/tabby-eval/setup.cfg
@@ -0,0 +1,2 @@
[metadata]
name = tabby_data_pipeline
17 changes: 17 additions & 0 deletions python/tabby-eval/setup.py
@@ -0,0 +1,17 @@
from setuptools import find_packages, setup

setup(
name="tabby_data_pipeline",
packages=find_packages(exclude=["tabby_data_pipeline_tests"]),
install_requires=[
"dagster",
"dagster-cloud",
"dagstermill",
"papermill-origami>=0.0.8",
"pandas",
"matplotlib",
"seaborn",
"scikit-learn",
],
extras_require={"dev": ["dagster-webserver", "pytest"]},
)
17 changes: 17 additions & 0 deletions python/tabby-eval/tabby_data_pipeline/__init__.py
@@ -0,0 +1,17 @@
from dagster import Definitions, load_assets_from_modules
from dagstermill import ConfigurableLocalOutputNotebookIOManager

from . import assets, create_csv

all_assets = load_assets_from_modules([assets, create_csv])

defs = Definitions(
    assets=all_assets,
    resources={
        "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager()
    },
)


78 changes: 78 additions & 0 deletions python/tabby-eval/tabby_data_pipeline/analyze.py
@@ -0,0 +1,78 @@
import json


def get_bracket_lang_statement(completion):
    """Truncate at the first statement boundary (';', '{' or '}')."""
    end_idx = None
    for i, ch in enumerate(completion):
        if ch in [";", "{", "}"]:
            end_idx = i
            break
    # Check for None explicitly; `if end_idx` would miss a match at index 0.
    return completion[: end_idx + 1] if end_idx is not None else completion


def postprocess_code_lines(prompt, target, language):
    """Reduce `target` to its first statement for the given language."""
    try:
        if language in ["java", "csharp", "typescript"]:
            return get_bracket_lang_statement(target)
        elif language == "python":
            return target.split("\n")[0]
        # Fall back to the untouched target for any other language.
        return target
    except Exception:
        return target


def analyze(model, language, file):
    line_match = 0
    statement_match = 0

    input_file = f"./data/{model}/{language}/{file}"
    output_file = f"./data/{model}/{language}/result_{file}"

    with open(output_file, "w") as fout, open(input_file) as fin:
        for line in fin:
            obj = json.loads(line)
            result = {}
            prediction = ""

            # Copy every field that precedes "prediction"/"error" into the result;
            # this relies on the insertion order of the JSON object's keys.
            for k in obj.keys():
                if k == "prediction":
                    prediction = str(obj[k])
                    break
                elif k == "error":
                    break
                else:
                    result[k] = obj[k]

            tabby_eval = {}
            if file == "line_completion.jsonl":
                tabby_eval["raw_prompt"] = obj["prompt"]
            else:
                tabby_eval["raw_prompt"] = obj["crossfile_context"]["text"] + obj["prompt"]

            tabby_eval["prediction"] = prediction

            groundtruth = obj["groundtruth"]

            # Exact match on the first line.
            tabby_eval["first_line_prediction"] = prediction.split("\n")[0]
            tabby_eval["first_line_groundtruth"] = groundtruth.split("\n")[0]
            if tabby_eval["first_line_prediction"] == tabby_eval["first_line_groundtruth"]:
                tabby_eval["first_line_matched"] = True
                line_match += 1
            else:
                tabby_eval["first_line_matched"] = False

            # Exact match on the first statement (language-aware truncation).
            tabby_eval["first_statement_prediction"] = postprocess_code_lines(
                tabby_eval["raw_prompt"], prediction, language
            )
            tabby_eval["first_statement_groundtruth"] = postprocess_code_lines(
                tabby_eval["raw_prompt"], groundtruth, language
            )
            if tabby_eval["first_statement_prediction"] == tabby_eval["first_statement_groundtruth"]:
                tabby_eval["first_statement_matched"] = True
                statement_match += 1
            else:
                tabby_eval["first_statement_matched"] = False

            result["tabby_eval"] = tabby_eval

            json.dump(result, fout)
            fout.write("\n")

    # Surface the aggregate counts to callers that want them.
    return line_match, statement_match


145 changes: 145 additions & 0 deletions python/tabby-eval/tabby_data_pipeline/assets.py
@@ -0,0 +1,145 @@
import os
import subprocess

from dagster import (
AssetExecutionContext,
MetadataValue,
asset,
StaticPartitionsDefinition,
MultiPartitionsDefinition,
)
from . import analyze


@asset
def baseline() -> str:
return "line_completion.jsonl"

@asset
def bm25() -> str:
return "line_completion_rg1_bm25.jsonl"

@asset
def oracle() -> str:
return "line_completion_oracle_bm25.jsonl"

# All partitioned assets below share the same (model × language) partition grid.
MODEL_LANGUAGE_PARTITIONS = MultiPartitionsDefinition(
    {
        "model_id": StaticPartitionsDefinition(
            [
                "TabbyML/StarCoder-1B",
                "TabbyML/StarCoder-3B",
                "TabbyML/StarCoder-7B",
                "TabbyML/WizardCoder-1B",
                "TabbyML/WizardCoder-3B",
                "TabbyML/CodeLlama-7B",
                "TabbyML/CodeLlama-13B",
            ]
        ),
        "language": StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
    }
)


@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS)
def predict_baseline(context: AssetExecutionContext, baseline: str) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    # The modal script reads the model to serve from the MODEL_ID env var.
    my_env = os.environ.copy()
    my_env["MODEL_ID"] = model_id

    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})

    subprocess.run(
        ["modal", "run", "./modal/predict.py", "--language", language, "--files", baseline],
        env=my_env,
        check=True,  # fail the asset instead of silently reporting success
    )
    context.add_output_metadata(metadata={"modal run": MetadataValue.md("success!")})

@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS)
def predict_bm25(context: AssetExecutionContext, bm25: str) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    my_env = os.environ.copy()
    my_env["MODEL_ID"] = model_id

    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})

    subprocess.run(
        ["modal", "run", "./modal/predict.py", "--language", language, "--files", bm25],
        env=my_env,
        check=True,
    )
    context.add_output_metadata(metadata={"modal run": MetadataValue.md("success!")})


@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS)
def predict_oracle(context: AssetExecutionContext, oracle: str) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    my_env = os.environ.copy()
    my_env["MODEL_ID"] = model_id

    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})

    subprocess.run(
        ["modal", "run", "./modal/predict.py", "--language", language, "--files", oracle],
        env=my_env,
        check=True,
    )
    context.add_output_metadata(metadata={"modal run": MetadataValue.md("success!")})



@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS, deps=[predict_baseline])
def matching_baseline(context: AssetExecutionContext) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    model = model_id.split("/")[-1]
    analyze.analyze(model, language, "line_completion.jsonl")



@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS, deps=[predict_bm25])
def matching_bm25(context: AssetExecutionContext) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    model = model_id.split("/")[-1]
    analyze.analyze(model, language, "line_completion_rg1_bm25.jsonl")



@asset(partitions_def=MODEL_LANGUAGE_PARTITIONS, deps=[predict_oracle])
def matching_oracle(context: AssetExecutionContext) -> None:
    model_id = context.partition_key.keys_by_dimension["model_id"]
    language = context.partition_key.keys_by_dimension["language"]

    model = model_id.split("/")[-1]
    analyze.analyze(model, language, "line_completion_oracle_bm25.jsonl")
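
As a sketch of how one of these partitions could be materialized outside the UI (assuming the package is installed and the `modal` CLI is configured; the asset names come from this file):

```python
from dagster import MultiPartitionKey, materialize

from tabby_data_pipeline import assets

# Materialize the baseline prediction for a single (model, language) partition.
result = materialize(
    [assets.baseline, assets.predict_baseline],
    partition_key=MultiPartitionKey(
        {"model_id": "TabbyML/StarCoder-1B", "language": "python"}
    ),
)
assert result.success
```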
55 changes: 55 additions & 0 deletions python/tabby-eval/tabby_data_pipeline/create_csv.py
@@ -0,0 +1,55 @@
import json
import pandas as pd

from dagster import AssetIn, asset, file_relative_path
from dagstermill import define_dagstermill_asset

models = ["StarCoder-1B", "StarCoder-3B", "StarCoder-7B", "CodeLlama-7B", "CodeLlama-13B", "WizardCoder-1B", "WizardCoder-3B", "DeepseekCoder-1.3B", "DeepseekCoder-6.7B"]
languages = {"csharp": "C#", "java": "Java", "python": "Python", "typescript": "TypeScript"}
files = ["line_completion.jsonl", "line_completion_rg1_bm25.jsonl", "line_completion_oracle_bm25.jsonl"]
total_records = {"python": 2665, "java": 2139, "typescript": 3356, "csharp": 1768}

headers = ["Model", "Dataset", "Records", "baseline", "bm25", "oracle"]

def get_match(model, language, file):
    """Count records whose first predicted line exactly matches the groundtruth."""
    count = 0
    with open(f"./data/{model}/{language}/result_{file}") as f:
        for line in f:
            obj = json.loads(line)
            if obj["tabby_eval"]["first_line_matched"]:
                count += 1
    return count


@asset
def create_csv():
    # Build the table locally; a module-level list would keep growing across re-runs.
    stat = []
    for model in models:
        for language in languages.keys():
            x = [model, languages[language], total_records[language]]
            for f in files:
                x.append(get_match(model, language, f))
            stat.append(x)

    df = pd.DataFrame(stat, columns=headers)
    print(df)

    df.to_csv("./tabby_data_pipeline/tabby.csv", index=False)


@asset(deps=[create_csv])
def tabby_dataset():
    return pd.read_csv(file_relative_path(__file__, "tabby.csv"))


tabby_jupyter_notebook = define_dagstermill_asset(
    name="tabby_jupyter",
    notebook_path=file_relative_path(__file__, "tabby_eval.ipynb"),
    ins={"df": AssetIn("tabby_dataset")},
)
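
The resulting `tabby.csv` has one row per (model, language) pair. A minimal sanity check of its shape (column names taken from `headers` above):

```python
import pandas as pd

df = pd.read_csv("tabby_data_pipeline/tabby.csv")
assert list(df.columns) == ["Model", "Dataset", "Records", "baseline", "bm25", "oracle"]
```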