Machine Learning Project Template

The template is organized as follows:

.
├── .dockerignore
├── .dvc
│   └── config
├── .dvcignore
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
│   └── settings.json
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── artefacts
├── bin
│   ├── build-dataset
│   ├── evaluate
│   └── train-model
├── dvc.lock
├── dvc.yaml
├── prophet
│   ├── __init__.py
│   ├── config.py
│   ├── data.py
│   ├── dataset_builder.py
│   ├── evaluator.py
│   ├── text_classifier.py
│   └── trainer.py
├── pyproject.toml
├── tests
│   └── test_text_classifier.py
└── uv.lock

Package Management and Tooling

uv

The project uses uv for dependency and environment management; project metadata, dependencies, and development tooling are declared in pyproject.toml:

pyproject.toml
[project]
name = "prophet"
version = "0.1.0"
description = "Description for your ML project"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "dvc>=3.58.0",
    "fastapi>=0.115.6",
    "pandas>=2.2.3",
    "uvicorn>=0.34.0",
]
 
[dependency-groups]
dev = [
    "fastparquet>=2024.11.0",
    "loguru>=0.7.3",
    "pre-commit>=4.0.1",
    "pytest>=8.3.4",
    "ruff>=0.8.3",
]
 
[tool.ruff]
line-length = 100
target-version = "py312"
 
[tool.pytest.ini_options]
pythonpath = [
  "."
]
 
[tool.setuptools]
packages = ["prophet"]

Reproducibility and Data Versioning
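
Data and pipeline versioning is handled with DVC (note .dvc/config, .dvcignore, dvc.yaml, and dvc.lock in the tree above). The actual dvc.yaml is not reproduced here; a pipeline wiring the three bin/ scripts together might look roughly like this (stage names, deps, and outs are assumptions):

dvc.yaml (sketch)
# Illustrative pipeline definition; the real dvc.yaml may differ.
stages:
  build-dataset:
    cmd: uv run bin/build-dataset
    deps:
      - prophet/data.py
      - prophet/dataset_builder.py
    outs:
      - artefacts/data
  train-model:
    cmd: uv run bin/train-model
    deps:
      - artefacts/data
      - prophet/text_classifier.py
      - prophet/trainer.py
    outs:
      - artefacts/models
  evaluate:
    cmd: uv run bin/evaluate
    deps:
      - artefacts/data
      - artefacts/models
      - prophet/evaluator.py
    metrics:
      - artefacts/reports/evaluation.json:
          cache: false

dvc repro re-runs only the stages whose dependencies changed, and dvc push / dvc pull sync the artefacts with the remote storage configured in .dvc/config.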

Linting and Formatting
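
Linting and formatting are handled by Ruff, configured under [tool.ruff] above and run automatically through pre-commit (.pre-commit-config.yaml in the tree). The exact hook configuration is not shown here; a minimal version might be:

.pre-commit-config.yaml (sketch)
# Illustrative hook setup; the real config may pin a different rev or add hooks.
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.8.3
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format

Running uv run pre-commit install once registers the hooks so every commit is linted and formatted.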

Tests
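
Unit tests live in tests/ and run with pytest; the pythonpath setting above makes the prophet package importable from the repository root. The real test file is not reproduced here, but a minimal test of the TextClassifier (defined below under Model Training) could look like this:

tests/test_text_classifier.py (sketch)
# Illustrative tests; the actual test file may differ.
from prophet.data import Label
from prophet.text_classifier import TextClassifier


def test_predicts_just_do_it_when_positive_words_dominate():
    classifier = TextClassifier(
        positive_words={"launch", "progress"},
        negative_words={"halt", "risk"},
    )
    assert classifier.predict("let's launch and make progress") == Label.JUST_DO_IT


def test_defaults_to_stop_when_no_words_match():
    classifier = TextClassifier()
    assert classifier.predict("completely unrelated words") == Label.STOP

Run the suite with uv run pytest.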

Configuration

prophet/config.py
import os
import random
import sys
from pathlib import Path
 
import numpy as np
from loguru import logger
 
 
class Config:
    SEED = 42
 
    class Path:
        APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent))
        ARTEFACTS_DIR = APP_HOME / "artefacts"
        DATA_DIR = ARTEFACTS_DIR / "data"
        MODELS_DIR = ARTEFACTS_DIR / "models"
        REPORTS_DIR = ARTEFACTS_DIR / "reports"
 
    class Dataset:
        TEST_SIZE = 0.2
        TRAIN_FILE = "train.parquet"
        TEST_FILE = "test.parquet"
 
    class Model:
        FILE_NAME = "model.json"
 
    class Evaluation:
        REPORT_FILE = "evaluation.json"
 
 
def seed_everything(seed: int = Config.SEED):
    random.seed(seed)
    np.random.seed(seed)
 
 
def configure_logging():
    config = {
        "handlers": [
            {
                "sink": sys.stdout,
                "colorize": True,
                "format": "<green>{time:YYYY-MM-DD - HH:mm:ss}</green> | <level>{level}</level> | {message}",
            },
        ]
    }
    logger.configure(**config)

Data Preprocessing

prophet/data.py
from enum import Enum
 
 
class Label(str, Enum):
    JUST_DO_IT = "just-do-it"
    STOP = "stop"
 
 
EXAMPLES = [
    {
        "text": "let's launch the project immediately and make progress", 
        "label": Label.JUST_DO_IT
    },
    ...
]
 
prophet/dataset_builder.py
from pathlib import Path
 
import pandas as pd
from loguru import logger
 
from prophet.config import Config
from prophet.data import EXAMPLES
 
 
def build_dataset(save_dir: Path) -> pd.DataFrame:
    logger.debug(f"Building dataset at {save_dir}")
    save_dir.mkdir(parents=True, exist_ok=True)
    rows = [{"text": r["text"], "label": r["label"].value} for r in EXAMPLES]
    df = pd.DataFrame(rows)
    df_train = df.sample(frac=1.0 - Config.Dataset.TEST_SIZE)
    df_test = df.drop(df_train.index)
    df_train.to_parquet(save_dir / Config.Dataset.TRAIN_FILE)
    df_test.to_parquet(save_dir / Config.Dataset.TEST_FILE)
    return df
bin/build-dataset
#!/usr/bin/env python
 
from prophet.config import Config, configure_logging, seed_everything
from prophet.dataset_builder import build_dataset
 
seed_everything()
configure_logging()
 
 
build_dataset(Config.Path.DATA_DIR)
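
The script is executable, so it can be run directly or through the project environment:

uv run bin/build-dataset

Once the DVC pipeline is in place, dvc repro runs it as the first stage and records the resulting parquet files in dvc.lock.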

Model Training

prophet/text_classifier.py
import json
from pathlib import Path
 
import pandas as pd
 
from prophet.data import Label
 
 
class TextClassifier:
    def __init__(
        self,
        positive_words: set[str] | None = None,
        negative_words: set[str] | None = None,
    ):
        self.positive_words = positive_words or set()
        self.negative_words = negative_words or set()
 
    def fit(self, training_data):
        df = pd.read_parquet(training_data)
        self.positive_words = set()
        self.negative_words = set()
 
        for _, item in df.iterrows():
            words = item.text.lower().split()
            if item.label == Label.JUST_DO_IT:
                self.positive_words.update(words)
            else:
                self.negative_words.update(words)
 
        # Remove words that appear in both categories as they're not discriminative
        common_words = self.positive_words.intersection(self.negative_words)
        self.positive_words -= common_words
        self.negative_words -= common_words
 
    def predict(self, text) -> Label:
        words = set(text.lower().split())
 
        positive_matches = len(words.intersection(self.positive_words))
        negative_matches = len(words.intersection(self.negative_words))
 
        return Label.JUST_DO_IT if positive_matches > negative_matches else Label.STOP
 
    def save(self, filepath: Path):
        data = {
            "positive_words": list(self.positive_words),
            "negative_words": list(self.negative_words),
        }
        filepath.write_text(json.dumps(data))
 
    @staticmethod
    def load(filepath: Path) -> "TextClassifier":
        data = json.loads(filepath.read_text())
        return TextClassifier(set(data["positive_words"]), set(data["negative_words"]))
prophet/trainer.py
from pathlib import Path
 
from loguru import logger
 
from prophet.config import Config
from prophet.text_classifier import TextClassifier
 
 
def train(train_file: Path, save_dir: Path):
    logger.debug("Model training started...")
    save_dir.mkdir(parents=True, exist_ok=True)
    model = TextClassifier()
    model.fit(train_file)
    model.save(save_dir / Config.Model.FILE_NAME)
    logger.debug(f"Model training completed (saved at {save_dir / Config.Model.FILE_NAME})")

Evaluation

prophet/evaluator.py
import json
from pathlib import Path
 
import pandas as pd
from loguru import logger
 
from prophet.config import Config
from prophet.text_classifier import TextClassifier
 
 
def evaluate(test_data_file: Path, model_file: Path, report_path: Path) -> float:
    df = pd.read_parquet(test_data_file)
    logger.debug(f"Evaluating model with {len(df)} samples")
    report_path.parent.mkdir(parents=True, exist_ok=True)
    classifier = TextClassifier.load(model_file)
    correct = 0
    for _, item in df.iterrows():
        prediction = classifier.predict(item.text)
        if prediction == item.label:
            correct += 1
    accuracy = correct / len(df)
    metrics = {"accuracy": accuracy}
    logger.debug(f"Model metrics: {metrics}")
    report_path.write_text(json.dumps(metrics))
    return accuracy
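
Likewise, the bin/evaluate entry point is not shown above; a sketch consistent with the evaluator's signature would be:

bin/evaluate (sketch)
#!/usr/bin/env python

# Illustrative entry point; the real script may differ.
from prophet.config import Config, configure_logging, seed_everything
from prophet.evaluator import evaluate

seed_everything()
configure_logging()


evaluate(
    Config.Path.DATA_DIR / Config.Dataset.TEST_FILE,
    Config.Path.MODELS_DIR / Config.Model.FILE_NAME,
    Config.Path.REPORTS_DIR / Config.Evaluation.REPORT_FILE,
)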

Model Serving

app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
 
from prophet.config import Config
from prophet.data import Label
from prophet.text_classifier import TextClassifier
 
 
class PredictRequest(BaseModel):
    text: str
 
 
class PredictResponse(BaseModel):
    prediction: Label
    text: str
 
 
app = FastAPI(
    title="Prophet API",
    description="Predict the label of a text - just-do-it or stop",
    version="0.1.0",
)
model = TextClassifier.load(Config.Path.MODELS_DIR / Config.Model.FILE_NAME)
 
 
@app.post(
    "/predict",
    response_model=PredictResponse,
    summary="Predict the label of a text",
)
async def predict(request: PredictRequest):
    try:
        prediction = model.predict(request.text)
        return PredictResponse(prediction=Label(prediction), text=request.text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
 
if __name__ == "__main__":
    import uvicorn
 
    uvicorn.run(app, host="0.0.0.0", port=8000)
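
With a trained model in artefacts/models, the server starts with uv run python app.py (or uvicorn app:app --reload during development) and can be exercised with a plain HTTP request, for example:

curl -X POST http://localhost:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"text": "launch the project immediately and make progress"}'

The response echoes the text together with the predicted label (just-do-it or stop).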

Dockerization

Dockerfile
FROM python:3.12-slim-bookworm
 
# Set working directory
WORKDIR /app
 
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*
 
# Download the latest installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh
 
# Run the installer then remove it
RUN sh /uv-installer.sh && rm /uv-installer.sh
 
# Ensure the installed uv binary and the project's virtual environment are on the PATH
ENV PATH="/root/.local/bin/:$PATH"
ENV PATH="/app/.venv/bin:$PATH"
 
# Copy project files
COPY pyproject.toml README.md app.py uv.lock ./
COPY prophet prophet/
COPY artefacts artefacts/
 
# Install Python dependencies
RUN uv sync --frozen
 
# Expose port
EXPOSE 8000
 
# Run the FastAPI app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]