Skip to Content
Bootcamp — Machine Learning Project Template

Machine Learning Project Template

. ├── .dockerignore ├── .dvc │   └── config ├── .dvcignore ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode │   └── settings.json ├── Dockerfile ├── LICENSE ├── README.md ├── app.py ├── artefacts ├── bin │   ├── build-dataset │   ├── evaluate │   └── train-model ├── dvc.lock ├── dvc.yaml ├── prophet │   ├── __init__.py │   ├── config.py │   ├── data.py │   ├── dataset_builder.py │   ├── evaluator.py │   ├── text_classifier.py │   └── trainer.py ├── pyproject.toml ├── tests │   └── test_text_classifier.py └── uv.lock

Package Management and Tooling

uv

pyproject.toml
[project]
name = "prophet"
version = "0.1.0"
description = "Description for your ML project"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "dvc>=3.58.0",
    "fastapi>=0.115.6",
    "pandas>=2.2.3",
    "uvicorn>=0.34.0",
]

[dependency-groups]
dev = [
    "fastparquet>=2024.11.0",
    "loguru>=0.7.3",
    "pre-commit>=4.0.1",
    "pytest>=8.3.4",
    "ruff>=0.8.3",
]

[tool.ruff]
line-length = 100
target-version = "py312"

[tool.pytest.ini_options]
pythonpath = ["."]

[tool.setuptools]
packages = ["prophet"]

Reproducibility and Data Versioning

Linting and Formatting

Tests

Configuration

prophet/config.py
"""Project-wide configuration, RNG seeding, and logging setup."""

import os
import random
import sys
from pathlib import Path

import numpy as np
from loguru import logger


class Config:
    """Central namespace of project constants (paths, dataset, model, eval)."""

    SEED = 42

    class Path:
        # NOTE: this nested class shadows ``pathlib.Path`` within ``Config``;
        # the calls below still resolve to the module-level ``pathlib.Path``.
        APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent))
        ARTEFACTS_DIR = APP_HOME / "artefacts"
        DATA_DIR = ARTEFACTS_DIR / "data"
        MODELS_DIR = ARTEFACTS_DIR / "models"
        REPORTS_DIR = ARTEFACTS_DIR / "reports"

    class Dataset:
        TEST_SIZE = 0.2
        TRAIN_FILE = "train.parquet"
        TEST_FILE = "test.parquet"

    class Model:
        FILE_NAME = "model.json"

    class Evaluation:
        REPORT_FILE = "evaluation.json"


def seed_everything(seed: int = Config.SEED):
    """Seed Python's and NumPy's global RNGs for reproducible runs."""
    random.seed(seed)
    np.random.seed(seed)


def configure_logging():
    """Send all loguru output to stdout with a colorized timestamp format."""
    handler = {
        "sink": sys.stdout,
        "colorize": True,
        "format": (
            "<green>{time:YYYY-MM-DD - HH:mm:ss}</green>"
            " | <level>{level}</level> | {message}"
        ),
    }
    logger.configure(handlers=[handler])

Data Preprocessing

prophet/data.py
"""Labels and raw labelled examples for the text-classification dataset."""

from enum import Enum


class Label(str, Enum):
    """Possible classification outcomes for a piece of text."""

    JUST_DO_IT = "just-do-it"
    STOP = "stop"


# Raw labelled examples; truncated in this article (the trailing ``...`` is
# a literal Ellipsis standing in for the rest of the list in the repo).
EXAMPLES = [
    {
        "text": "let's launch the project immediately and make progress",
        "label": Label.JUST_DO_IT,
    },
    ...
]
prophet/dataset_builder.py
"""Build the train/test parquet dataset from the raw labelled examples."""

from pathlib import Path

import pandas as pd
from loguru import logger

from prophet.config import Config
from prophet.data import EXAMPLES


def build_dataset(save_dir: Path) -> pd.DataFrame:
    """Split EXAMPLES into train/test parquet files under ``save_dir``.

    A random ``Config.Dataset.TEST_SIZE`` fraction is held out for the test
    split; callers should seed the global RNGs first (``seed_everything``)
    for a reproducible split.

    Returns:
        The full, unsplit DataFrame with ``text`` and ``label`` columns.
    """
    logger.debug(f"Building dataset at {save_dir}")
    # BUGFIX: create the directory we actually write to — the original
    # mkdir'd the hard-coded Config.Path.DATA_DIR, which only coincides
    # with save_dir for the default caller.
    save_dir.mkdir(parents=True, exist_ok=True)
    rows = [{"text": r["text"], "label": r["label"].value} for r in EXAMPLES]
    df = pd.DataFrame(rows)
    df_train = df.sample(frac=1.0 - Config.Dataset.TEST_SIZE)
    df_test = df.drop(df_train.index)
    df_train.to_parquet(save_dir / Config.Dataset.TRAIN_FILE)
    df_test.to_parquet(save_dir / Config.Dataset.TEST_FILE)
    return df
bin/build-dataset
#!/usr/bin/env python
"""Entry point: configure logging, seed the RNGs, and build the dataset."""

from prophet.config import Config, configure_logging, seed_everything
from prophet.dataset_builder import build_dataset

configure_logging()
seed_everything()
build_dataset(Config.Path.DATA_DIR)

Model Training

prophet/text_classifier.py
"""Keyword-overlap text classifier used as the project's baseline model."""

import json
from pathlib import Path
from typing import Set

import pandas as pd

from prophet.data import Label


class TextClassifier:
    """Classify text by comparing its words against per-label word sets."""

    def __init__(
        self,
        positive_words: Set[str] | None = None,
        negative_words: Set[str] | None = None,
    ):
        # BUGFIX: the original used mutable default arguments (``set()``),
        # which are shared across every instance created without arguments —
        # one instance's fit() would leak words into another's defaults.
        self.positive_words = positive_words if positive_words is not None else set()
        self.negative_words = negative_words if negative_words is not None else set()

    def fit(self, training_data):
        """Learn discriminative word sets from a parquet file of labelled rows.

        Expects columns ``text`` and ``label`` (label values are the string
        values of the ``Label`` enum).
        """
        df = pd.read_parquet(training_data)
        self.positive_words = set()
        self.negative_words = set()
        for _, item in df.iterrows():
            words = item.text.lower().split()
            if item.label == Label.JUST_DO_IT:
                self.positive_words.update(words)
            else:
                self.negative_words.update(words)
        # Remove words that appear in both categories as they're not discriminative
        common_words = self.positive_words.intersection(self.negative_words)
        self.positive_words -= common_words
        self.negative_words -= common_words

    def predict(self, text) -> Label:
        """Return the label whose word set overlaps ``text`` the most.

        Ties — including no overlap at all — resolve to ``Label.STOP``.
        """
        words = set(text.lower().split())
        positive_matches = len(words.intersection(self.positive_words))
        negative_matches = len(words.intersection(self.negative_words))
        return Label.JUST_DO_IT if positive_matches > negative_matches else Label.STOP

    def save(self, filepath: Path):
        """Serialize the learned word sets as JSON at ``filepath``."""
        data = {
            "positive_words": list(self.positive_words),
            "negative_words": list(self.negative_words),
        }
        filepath.write_text(json.dumps(data))

    @staticmethod
    def load(filepath: Path) -> "TextClassifier":
        """Reconstruct a classifier from a JSON file written by ``save``."""
        data = json.loads(filepath.read_text())
        return TextClassifier(set(data["positive_words"]), set(data["negative_words"]))
prophet/trainer.py
"""Train the TextClassifier and persist it to disk."""

from pathlib import Path

from loguru import logger

from prophet.config import Config
from prophet.text_classifier import TextClassifier


def train(train_file: Path, save_dir: Path):
    """Fit a TextClassifier on ``train_file`` and save it under ``save_dir``."""
    logger.debug("Model training started...")
    # BUGFIX: create the directory we actually save to — the original
    # mkdir'd the hard-coded Config.Path.MODELS_DIR, which only coincides
    # with save_dir for the default caller.
    save_dir.mkdir(parents=True, exist_ok=True)
    model = TextClassifier()
    model.fit(train_file)
    model_path = save_dir / Config.Model.FILE_NAME
    model.save(model_path)
    logger.debug(f"Model training completed (saved at {model_path})")

Evaluation

prophet/evaluator.py
"""Evaluate a trained TextClassifier on the held-out test set."""

import json
from pathlib import Path

import pandas as pd
from loguru import logger

from prophet.text_classifier import TextClassifier


def evaluate(test_data_file: Path, model_file: Path, report_path: Path) -> float:
    """Compute the model's accuracy on the test set and write a JSON report.

    Args:
        test_data_file: parquet file with ``text`` and ``label`` columns.
        model_file: JSON file written by ``TextClassifier.save``.
        report_path: where the ``{"accuracy": ...}`` report is written.

    Returns:
        The accuracy in [0, 1].

    Raises:
        ValueError: if the test set is empty (accuracy would be undefined).
    """
    df = pd.read_parquet(test_data_file)
    logger.debug(f"Evaluating model with {len(df)} samples")
    if df.empty:
        # BUGFIX: the original crashed with an opaque ZeroDivisionError here.
        raise ValueError("Test dataset is empty; cannot compute accuracy")
    # BUGFIX: create the directory we actually write to — the original
    # mkdir'd the hard-coded Config.Path.REPORTS_DIR, which only coincides
    # with report_path's parent for the default caller.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    classifier = TextClassifier.load(model_file)
    correct = sum(
        1 for _, item in df.iterrows() if classifier.predict(item.text) == item.label
    )
    accuracy = correct / len(df)
    metrics = {"accuracy": accuracy}
    logger.debug(f"Model metrics: {metrics}")
    report_path.write_text(json.dumps(metrics))
    # BUGFIX: the original declared ``-> float`` but implicitly returned None.
    return accuracy

Model Serving

app.py
"""FastAPI service exposing the trained TextClassifier over HTTP."""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from prophet.config import Config
from prophet.data import Label
from prophet.text_classifier import TextClassifier


class PredictRequest(BaseModel):
    text: str


class PredictResponse(BaseModel):
    prediction: Label
    text: str


app = FastAPI(
    title="Prophet API",
    description="Predict the label of a text - just-do-it or stop",
    version="0.1.0",
)

# Load the trained model once at startup; every request reuses this instance.
model = TextClassifier.load(Config.Path.MODELS_DIR / Config.Model.FILE_NAME)


@app.post(
    "/predict",
    response_model=PredictResponse,
    summary="Predict the label of a text",
)
async def predict(request: PredictRequest):
    """Classify ``request.text`` and echo the text back with its label."""
    try:
        label = model.predict(request.text)
        return PredictResponse(prediction=Label(label), text=request.text)
    except Exception as err:
        # Surface any model failure as a plain 500 with the error message.
        raise HTTPException(status_code=500, detail=str(err))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

Dockerization

Dockerfile
# Serve the Prophet FastAPI app from a slim Python 3.12 image.
FROM python:3.12-slim-bookworm

# All project files live under /app.
WORKDIR /app

# System packages needed to build wheels and fetch the uv installer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Fetch the uv installer, run it, then discard it.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN sh /uv-installer.sh && rm /uv-installer.sh

# Put both the uv binary and the project virtualenv on PATH.
ENV PATH="/root/.local/bin/:$PATH"
ENV PATH="/app/.venv/bin:$PATH"

# Copy project metadata, source, and the trained artefacts.
COPY pyproject.toml README.md app.py uv.lock ./
COPY prophet prophet/
COPY artefacts artefacts/

# Install Python dependencies exactly as pinned in uv.lock.
RUN uv sync --frozen

# Serve the FastAPI app on port 8000.
EXPOSE 8000
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]