Machine Learning Project Template
.
├── .dockerignore
├── .dvc
│   └── config
├── .dvcignore
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
│   └── settings.json
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── artefacts
├── bin
│   ├── build-dataset
│   ├── evaluate
│   └── train-model
├── dvc.lock
├── dvc.yaml
├── prophet
│   ├── __init__.py
│   ├── config.py
│   ├── data.py
│   ├── dataset_builder.py
│   ├── evaluator.py
│   ├── text_classifier.py
│   └── trainer.py
├── pyproject.toml
├── tests
│   └── test_text_classifier.py
└── uv.lock
Package Management and Tooling
uv
pyproject.toml
[project]
name = "prophet"
version = "0.1.0"
description = "Description for your ML project"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "dvc>=3.58.0",
    "fastapi>=0.115.6",
    "pandas>=2.2.3",
    "uvicorn>=0.34.0",
]

[dependency-groups]
dev = [
    "fastparquet>=2024.11.0",
    "loguru>=0.7.3",
    "pre-commit>=4.0.1",
    "pytest>=8.3.4",
    "ruff>=0.8.3",
]

[tool.ruff]
line-length = 100
target-version = "py312"

[tool.pytest.ini_options]
pythonpath = [
    "."
]

[tool.setuptools]
packages = ["prophet"]
Reproducibility and Data Versioning
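DVC wires the three bin/ scripts into a pipeline so that dvc repro re-runs only the stages whose dependencies changed, with dvc.lock recording the resulting hashes. A minimal dvc.yaml sketch, assuming stages that mirror the bin/ scripts and the artefact paths from Config (the project's actual pipeline file may differ):

dvc.yaml
stages:
  build-dataset:
    cmd: python bin/build-dataset
    deps:
      - prophet/data.py
      - prophet/dataset_builder.py
    outs:
      - artefacts/data
  train-model:
    cmd: python bin/train-model
    deps:
      - artefacts/data
      - prophet/trainer.py
    outs:
      - artefacts/models
  evaluate:
    cmd: python bin/evaluate
    deps:
      - artefacts/models
      - prophet/evaluator.py
    metrics:
      - artefacts/reports/evaluation.json:
          cache: false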
Linting and Formatting
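Ruff handles both linting and formatting, configured under [tool.ruff] in pyproject.toml. The checks run on every commit via pre-commit; a minimal .pre-commit-config.yaml sketch using the official ruff hooks (pin the rev to match your ruff version, then run pre-commit install once):

.pre-commit-config.yaml
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.8.3
    hooks:
      - id: ruff          # lint, with autofix
        args: [--fix]
      - id: ruff-format   # format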
Tests
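tests/test_text_classifier.py keeps the model logic honest. A representative pair of tests, assuming the TextClassifier API shown in the Model Training section below:

from prophet.data import Label
from prophet.text_classifier import TextClassifier


def test_predict_prefers_label_with_more_word_matches():
    classifier = TextClassifier(
        positive_words={"launch", "progress"},
        negative_words={"halt", "danger"},
    )
    assert classifier.predict("halt everything, danger ahead") == Label.STOP


def test_save_and_load_round_trip(tmp_path):
    classifier = TextClassifier(positive_words={"go"}, negative_words={"wait"})
    model_file = tmp_path / "model.json"
    classifier.save(model_file)
    loaded = TextClassifier.load(model_file)
    assert loaded.positive_words == {"go"}
    assert loaded.negative_words == {"wait"}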
Configuration
prophet/config.py
import os
import random
import sys
from pathlib import Path

import numpy as np
from loguru import logger


class Config:
    SEED = 42

    class Path:
        # NB: this nested class reuses the name "Path"; the pathlib.Path call
        # below still resolves to the import, because the class name is only
        # bound once the class body has finished executing.
        APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent))
        ARTEFACTS_DIR = APP_HOME / "artefacts"
        DATA_DIR = ARTEFACTS_DIR / "data"
        MODELS_DIR = ARTEFACTS_DIR / "models"
        REPORTS_DIR = ARTEFACTS_DIR / "reports"

    class Dataset:
        TEST_SIZE = 0.2
        TRAIN_FILE = "train.parquet"
        TEST_FILE = "test.parquet"

    class Model:
        FILE_NAME = "model.json"

    class Evaluation:
        REPORT_FILE = "evaluation.json"


def seed_everything(seed: int = Config.SEED):
    random.seed(seed)
    np.random.seed(seed)


def configure_logging():
    config = {
        "handlers": [
            {
                "sink": sys.stdout,
                "colorize": True,
                "format": "<green>{time:YYYY-MM-DD - HH:mm:ss}</green> | <level>{level}</level> | {message}",
            },
        ]
    }
    logger.configure(**config)
Data Preprocessing
prophet/data.py
from enum import Enum


class Label(str, Enum):
    JUST_DO_IT = "just-do-it"
    STOP = "stop"


EXAMPLES = [
    {
        "text": "let's launch the project immediately and make progress",
        "label": Label.JUST_DO_IT,
    },
    ...
]
prophet/dataset_builder.py
from pathlib import Path

import pandas as pd
from loguru import logger

from prophet.config import Config
from prophet.data import EXAMPLES


def build_dataset(save_dir: Path) -> pd.DataFrame:
    logger.debug(f"Building dataset at {save_dir}")
    save_dir.mkdir(parents=True, exist_ok=True)
    rows = [{"text": r["text"], "label": r["label"].value} for r in EXAMPLES]
    df = pd.DataFrame(rows)
    # Shuffle-split: sample the train fraction, the remainder becomes the test set
    df_train = df.sample(frac=1.0 - Config.Dataset.TEST_SIZE)
    df_test = df.drop(df_train.index)
    df_train.to_parquet(save_dir / Config.Dataset.TRAIN_FILE)
    df_test.to_parquet(save_dir / Config.Dataset.TEST_FILE)
    return df
bin/build-dataset
#!/usr/bin/env python
from prophet.config import Config, configure_logging, seed_everything
from prophet.dataset_builder import build_dataset

seed_everything()
configure_logging()
build_dataset(Config.Path.DATA_DIR)
Model Training
prophet/text_classifier.py
import json
from pathlib import Path

import pandas as pd

from prophet.data import Label


class TextClassifier:
    def __init__(
        self,
        positive_words: set[str] | None = None,
        negative_words: set[str] | None = None,
    ):
        # Default to fresh sets rather than mutable default arguments
        self.positive_words = positive_words or set()
        self.negative_words = negative_words or set()

    def fit(self, training_data: Path):
        df = pd.read_parquet(training_data)
        self.positive_words = set()
        self.negative_words = set()
        for _, item in df.iterrows():
            words = item.text.lower().split()
            if item.label == Label.JUST_DO_IT:
                self.positive_words.update(words)
            else:
                self.negative_words.update(words)
        # Remove words that appear in both categories as they're not discriminative
        common_words = self.positive_words.intersection(self.negative_words)
        self.positive_words -= common_words
        self.negative_words -= common_words

    def predict(self, text: str) -> Label:
        words = set(text.lower().split())
        positive_matches = len(words.intersection(self.positive_words))
        negative_matches = len(words.intersection(self.negative_words))
        return Label.JUST_DO_IT if positive_matches > negative_matches else Label.STOP

    def save(self, filepath: Path):
        data = {
            "positive_words": list(self.positive_words),
            "negative_words": list(self.negative_words),
        }
        filepath.write_text(json.dumps(data))

    @staticmethod
    def load(filepath: Path) -> "TextClassifier":
        data = json.loads(filepath.read_text())
        return TextClassifier(set(data["positive_words"]), set(data["negative_words"]))
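End to end, the classifier is a simple word-membership vote. A quick usage sketch (paths come from Config; the prediction depends on the trained vocabulary):

from pathlib import Path

from prophet.text_classifier import TextClassifier

model = TextClassifier()
model.fit(Path("artefacts/data/train.parquet"))
print(model.predict("let's launch the project immediately"))  # e.g. Label.JUST_DO_IT
model.save(Path("artefacts/models/model.json"))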
prophet/trainer.py
from pathlib import Path

from loguru import logger

from prophet.config import Config
from prophet.text_classifier import TextClassifier


def train(train_file: Path, save_dir: Path):
    logger.debug("Model training started...")
    save_dir.mkdir(parents=True, exist_ok=True)
    model = TextClassifier()
    model.fit(train_file)
    model.save(save_dir / Config.Model.FILE_NAME)
    logger.debug(f"Model training completed (saved at {save_dir / Config.Model.FILE_NAME})")
Evaluation
prophet/evaluator.py
import json
from pathlib import Path

import pandas as pd
from loguru import logger

from prophet.text_classifier import TextClassifier


def evaluate(test_data_file: Path, model_file: Path, report_path: Path) -> float:
    df = pd.read_parquet(test_data_file)
    logger.debug(f"Evaluating model with {len(df)} samples")
    report_path.parent.mkdir(parents=True, exist_ok=True)
    classifier = TextClassifier.load(model_file)
    correct = 0
    for _, item in df.iterrows():
        prediction = classifier.predict(item.text)
        if prediction == item.label:
            correct += 1
    accuracy = correct / len(df)
    metrics = {"accuracy": accuracy}
    logger.debug(f"Model metrics: {metrics}")
    report_path.write_text(json.dumps(metrics))
    return accuracy
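Again, bin/evaluate presumably follows the same entry-point pattern; a sketch under that assumption:

bin/evaluate
#!/usr/bin/env python
from prophet.config import Config, configure_logging, seed_everything
from prophet.evaluator import evaluate

seed_everything()
configure_logging()
evaluate(
    Config.Path.DATA_DIR / Config.Dataset.TEST_FILE,
    Config.Path.MODELS_DIR / Config.Model.FILE_NAME,
    Config.Path.REPORTS_DIR / Config.Evaluation.REPORT_FILE,
)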
Model Serving
app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from prophet.config import Config
from prophet.data import Label
from prophet.text_classifier import TextClassifier


class PredictRequest(BaseModel):
    text: str


class PredictResponse(BaseModel):
    prediction: Label
    text: str


app = FastAPI(
    title="Prophet API",
    description="Predict the label of a text - just-do-it or stop",
    version="0.1.0",
)

model = TextClassifier.load(Config.Path.MODELS_DIR / Config.Model.FILE_NAME)


@app.post(
    "/predict",
    response_model=PredictResponse,
    summary="Predict the label of a text",
)
async def predict(request: PredictRequest):
    try:
        prediction = model.predict(request.text)
        return PredictResponse(prediction=Label(prediction), text=request.text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
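Start the server (uv run python app.py, or uv run uvicorn app:app --reload during development) and exercise the endpoint; the exact prediction depends on the trained vocabulary:

curl -X POST http://localhost:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"text": "launch the project immediately and make progress"}'
# e.g. {"prediction":"just-do-it","text":"launch the project immediately and make progress"}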
Dockerization
Dockerfile
FROM python:3.12-slim-bookworm

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Download the latest uv installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh

# Run the installer, then remove it
RUN sh /uv-installer.sh && rm /uv-installer.sh

# Put the uv binary and the project virtualenv on PATH
ENV PATH="/root/.local/bin/:$PATH"
ENV PATH="/app/.venv/bin:$PATH"

# Copy project files
COPY pyproject.toml README.md app.py uv.lock ./
COPY prophet prophet/
COPY artefacts artefacts/

# Install Python dependencies
RUN uv sync --frozen

# Expose port
EXPOSE 8000

# Run the FastAPI app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]