Skip to main content

Multi-Manifest Submission

Given a directory with multiple AIchor manifest files, launches an experiment per manifest.

Uses the files in the HEAD commit.

Checks that the manifests directory has been committed and that the committed code is up to date.

Waits for the first build step in the AIchor workflow to complete before submitting the rest of the experiments so that subsequent build jobs benefit from the Docker layer cache.

Usage

./03-multi-manifest-submission.sh [--manifests-dir PATH] [--project-name NAME] [--engine-name NAME] [--branch NAME] [--commit-sha SHA] [--build-timeout SECS]

Full Script

03-multi-manifest-submission.sh
#!/usr/bin/env bash
# ==============================================================================
# 03-multi-manifest-submission.sh
#
# Submits multiple experiments from a manifests/ directory, each using the
# current HEAD commit via the commit-sha submission path.
#
# STRATEGY — why we wait for the first build:
# The first submission builds a Docker image from scratch and pushes it to the
# registry. All subsequent experiments share the same base image and skip the
# slow build phase (cache hit). Submitting all experiments at once before the
# first build completes means every job queues a separate build, multiplying
# build time. Wait for build to finish first, then fire off the rest.
#
# Usage:
# ./03-multi-manifest-submission.sh [options]
#
# Options:
# --manifests-dir PATH Directory containing manifest files (default: manifests/)
# --project-name NAME Override the project from CLI context
# --engine-name NAME Override the engine from CLI context
# --branch NAME Git branch to use (default: current branch)
# --commit-sha SHA Commit SHA to use (default: HEAD)
# --build-timeout SECS Max seconds to wait for first build (default: 1800)
#
# Environment variables (used as fallbacks):
# AICHOR_API_KEY
# AICHOR_BRANCH
#
# Prerequisites:
# - manifests/ directory exists in the repo root (or pass --manifests-dir)
# - All referenced manifest changes are committed and pushed to the remote
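# - aichor CLI is authenticated (or AICHOR_API_KEY is set so the script can log in)
# - git and jq are installed (jq parses the JSON printed by the aichor CLI)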
# ==============================================================================

# -e: abort on the first unhandled command failure.
# -o pipefail: a pipeline fails when any stage fails, so the `aichor ... | jq ...`
#   pipelines below report a failing CLI call instead of only jq's exit status.
# NOTE(review): -u (error on unset variables) is not enabled — confirm whether
# that is deliberate before adding it.
set -eo pipefail

# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
# Directory scanned for manifest files.
MANIFESTS_DIR='manifests/'

# Left empty unless overridden on the command line; empty values add no
# --project-name / --engine-name flag to the CLI calls.
PROJECT_NAME=
ENGINE_NAME=

# AICHOR_BRANCH wins when it is set and non-empty; otherwise ask git for the
# branch that is currently checked out.
if [[ -n "${AICHOR_BRANCH:-}" ]]; then
  BRANCH="$AICHOR_BRANCH"
else
  BRANCH="$(git branch --show-current)"
fi

# Commit submitted by default: whatever HEAD points at right now.
COMMIT_SHA=$(git rev-parse HEAD)

# 30 min — adjust for large images
BUILD_TIMEOUT=$(( 30 * 60 ))

# Seconds between two polls of the first experiment's build step.
BUILD_POLL_INTERVAL=30

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# Print this script's header comment block as help text.
# Only POSIX sed constructs are used: the GNU-only `\+` and `\?` escapes are
# not understood by BSD/macOS sed.
print_usage() {
  sed -n '2,/^# ==*$/p' "$0" | grep '^#' | sed 's/^# \{0,1\}//'
}

# Abort with a usage error unless an option is followed by a value.
#   $1 - option name (for the error message)
#   $2 - number of arguments still to be parsed, including the option itself
# Without this guard `shift 2` fails on a trailing option and `set -e` ends the
# script without any explanation.
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

# Apply command-line overrides to the default settings.
#   $@ - the script's arguments
# Exits 0 after printing help, exits 1 on an unknown option, a missing value,
# or a --build-timeout that is not a whole number of seconds.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --manifests-dir)
        require_value "$1" "$#"
        MANIFESTS_DIR="$2"
        shift 2
        ;;
      --project-name)
        require_value "$1" "$#"
        PROJECT_NAME="$2"
        shift 2
        ;;
      --engine-name)
        require_value "$1" "$#"
        ENGINE_NAME="$2"
        shift 2
        ;;
      --branch)
        require_value "$1" "$#"
        BRANCH="$2"
        shift 2
        ;;
      --commit-sha)
        require_value "$1" "$#"
        COMMIT_SHA="$2"
        shift 2
        ;;
      --build-timeout)
        require_value "$1" "$#"
        # The value is later used in an integer comparison, so reject anything
        # that is not a plain non-negative integer up front.
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "ERROR: --build-timeout expects a whole number of seconds, got: $2" >&2
          exit 1
        fi
        BUILD_TIMEOUT="$2"
        shift 2
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Print a banner: a blank line, a horizontal rule, the title (all arguments
# joined, indented by one space), and a closing rule.
section() {
  local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  printf '\n%s\n %s\n%s\n' "$rule" "$*" "$rule"
}

# Optional flags forwarded to the aichor CLI.
#   cli_flags     - used when submitting (project and engine overrides)
#   project_flags - used when querying an experiment (project override only)
# Both stay empty when no override was given.
cli_flags=()
project_flags=()

if [[ -n "$PROJECT_NAME" ]]; then
  cli_flags+=(--project-name "$PROJECT_NAME")
  project_flags+=(--project-name "$PROJECT_NAME")
fi

if [[ -n "$ENGINE_NAME" ]]; then
  cli_flags+=(--engine-name "$ENGINE_NAME")
fi

# Print the `.experiment_status` field of `aichor experiments status` for one
# experiment.
#   $1 - experiment ID
# Prints "unknown" when the field is null/absent, and also when the
# aichor | jq pipeline reports failure (with pipefail enabled, as this script
# does, a failing aichor call counts as a pipeline failure).
get_experiment_status() {
  local experiment="$1"

  if ! aichor experiments status "$experiment" "${project_flags[@]}" 2>/dev/null \
    | jq -r '.experiment_status // "unknown"'; then
    echo "unknown"
  fi
}

# Submit a single experiment by manifest path.
#   $1 - manifest path passed to the CLI as --manifest-path
# Reads COMMIT_SHA, BRANCH and cli_flags from the enclosing script.
# Echoes only the experiment ID to stdout; diagnostics go to stderr.
# Returns 0 on success and 1 when the CLI call fails, its output cannot be
# parsed, or the response carries no experiment ID.
submit_manifest() {
  local manifest_path="$1"
  local response exp_id

  if ! response=$(aichor experiments submit commit-sha "$COMMIT_SHA" \
    --branch "$BRANCH" \
    --manifest-path "$manifest_path" \
    "${cli_flags[@]}"); then
    echo "ERROR: aichor submission failed for manifest: $manifest_path" >&2
    return 1
  fi

  if ! exp_id=$(printf '%s\n' "$response" | jq -r '.experiment_id'); then
    echo "ERROR: could not parse the submission response for manifest: $manifest_path" >&2
    return 1
  fi

  # `jq -r` renders a missing or null field as the literal string "null";
  # treating that as an ID would send the caller off polling a bogus experiment.
  if [[ -z "$exp_id" || "$exp_id" == "null" ]]; then
    echo "ERROR: submission response contains no experiment_id for manifest: $manifest_path" >&2
    return 1
  fi

  printf '%s\n' "$exp_id"
}

# Block until the given experiment has moved past its build step.
# Possible steps: Waiting, Cloning, Building, Submitting, Running, Completed
#   $1 - experiment ID to poll
# Reads BUILD_POLL_INTERVAL, BUILD_TIMEOUT and project_flags from the script.
# Returns 0 as soon as the step is Submitting, Running or Completed.
# Returns 1 once the accumulated sleep time reaches BUILD_TIMEOUT.
wait_for_build_complete() {
  local experiment="$1"
  local waited=0
  local current

  echo "Polling build step for experiment $experiment..."
  echo "(poll interval: ${BUILD_POLL_INTERVAL}s, timeout: ${BUILD_TIMEOUT}s)"

  while :; do
    # A failed query is folded into the pseudo-step "unknown".
    current=$(aichor experiments step "$experiment" "${project_flags[@]}" 2>/dev/null \
      | jq -r '.experiment_step // "unknown"' \
      || echo "unknown")

    if [[ "$current" == "Submitting" || "$current" == "Running" || "$current" == "Completed" ]]; then
      # Past the build — the image is cached, safe to submit the rest
      echo " Build complete — step: $current"
      return 0
    elif [[ "$current" == "Waiting" || "$current" == "Cloning" || "$current" == "Building" ]]; then
      # Build still in progress
      printf " [%4ds] step: %-15s — waiting...\n" "$waited" "$current"
    else
      # "unknown" or any unrecognised step — keep polling
      printf " [%4ds] step: %-15s — retrying...\n" "$waited" "$current"
    fi

    # The timeout is evaluated after each poll, so at least one poll always runs.
    if [[ $waited -ge $BUILD_TIMEOUT ]]; then
      echo "ERROR: Timed out after ${BUILD_TIMEOUT}s waiting for build of $experiment." >&2
      echo "You can monitor it manually:" >&2
      echo " aichor experiments logs stream $experiment" >&2
      return 1
    fi

    sleep "$BUILD_POLL_INTERVAL"
    waited=$(( waited + BUILD_POLL_INTERVAL ))
  done
}

# ---------------------------------------------------------------------------
# 1. Authentication check
# ---------------------------------------------------------------------------
section "1. Authentication check"
# A successful `projects list` call is used as the "already logged in" probe.
# When it fails, log in with the API key; the `:?` expansion aborts the script
# with the given message if AICHOR_API_KEY is unset or empty.
aichor projects list --output json &>/dev/null || {
  echo "Not authenticated. Logging in..."
  aichor auth key --apikey "${AICHOR_API_KEY:?AICHOR_API_KEY not set}"
}
echo "Authenticated."

# ---------------------------------------------------------------------------
# 2. Validate manifests directory
# ---------------------------------------------------------------------------
section "2. Discovering manifests in: $MANIFESTS_DIR"

if [[ ! -d "$MANIFESTS_DIR" ]]; then
  echo "ERROR: '$MANIFESTS_DIR' not found. Run this script from the repo root," >&2
  echo "or use --manifests-dir to specify the correct path." >&2
  exit 1
fi

# Collect all .yaml and .yml files up to 2 levels deep.
#   ! -type d      - a directory that merely happens to be named *.yaml is not
#                    a manifest (symlinked manifest files are still accepted)
#   LC_ALL=C sort  - byte-wise ordering, so the manifest chosen for the first
#                    (cache-priming) submission does not depend on the locale
MANIFESTS=()
while IFS= read -r manifest_file; do
  MANIFESTS+=("$manifest_file")
done < <(
  find "$MANIFESTS_DIR" -maxdepth 2 ! -type d \( -name "*.yaml" -o -name "*.yml" \) \
    | LC_ALL=C sort
)

if [[ ${#MANIFESTS[@]} -eq 0 ]]; then
  echo "ERROR: No .yaml or .yml files found in $MANIFESTS_DIR" >&2
  exit 1
fi

echo "Found ${#MANIFESTS[@]} manifest(s):"
for m in "${MANIFESTS[@]}"; do
  echo " $m"
done

# ---------------------------------------------------------------------------
# 3. Git state check (ensure manifests are committed and experiment code is up to date)
# ---------------------------------------------------------------------------
section "3. Git state"
echo "Branch: $BRANCH"
echo "Commit SHA: $COMMIT_SHA"
echo "Message: $(git log -1 --pretty='%s' "$COMMIT_SHA")"
echo "Author: $(git log -1 --pretty='%an' "$COMMIT_SHA")"
echo

# Lower-case a reply so that "Y" and "y" are treated alike.
# (tr instead of ${var,,} keeps the script usable with bash 3.2.)
to_lower() {
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]'
}

# Decide whether the working tree needs the user's attention.
# `git diff HEAD` only looks at tracked files, so a brand-new (untracked)
# manifests directory would go unnoticed; check for that separately.
needs_attention=false
if ! git diff --quiet HEAD 2>/dev/null; then
  needs_attention=true
elif [[ -n "$(git ls-files --others --exclude-standard -- "$MANIFESTS_DIR" 2>/dev/null)" ]]; then
  needs_attention=true
fi

excluded_local_changes=false
if [[ "$needs_attention" == true ]]; then
  echo "You have uncommitted changes:"
  echo
  git status --short
  echo
  # `|| true`: read fails at end-of-input (e.g. no terminal); treat that as an
  # empty answer instead of letting `set -e` end the script without a word.
  response=""
  read -r -p "Do you want to commit them before submitting? [y/N] " response || true
  if [[ "$(to_lower "$response")" == "y" ]]; then
    add_mode=""
    read -r -p "Add all files or specific ones? [all/specific] " add_mode || true
    if [[ "$(to_lower "$add_mode")" == "specific" ]]; then
      files_input=""
      read -r -p "Enter file paths (space-separated): " files_input || true
      read -ra files_to_add <<< "$files_input"
      if [[ ${#files_to_add[@]} -eq 0 ]]; then
        echo "ERROR: No file paths given; nothing to commit." >&2
        exit 1
      fi
      # `--` keeps a path that starts with a dash from being read as an option
      git add -- "${files_to_add[@]}"
    else
      git add -A
    fi
    commit_message=""
    read -r -p "Commit message: " commit_message || true
    git commit -m "$commit_message"
    COMMIT_SHA="$(git rev-parse HEAD)"
    echo "Committed. New SHA: $COMMIT_SHA"
    echo
    echo "Pushing to origin/$BRANCH..."
    git push origin "$BRANCH"
    echo
  else
    continue_response=""
    read -r -p "Continue with the current committed state at $COMMIT_SHA? [y/N] " continue_response || true
    if [[ "$(to_lower "$continue_response")" != "y" ]]; then
      echo "Aborting."
      exit 0
    fi
    excluded_local_changes=true
  fi
fi

# Make sure the manifests dir exists in the commit being submitted — if it is
# new/untracked the remote won't have it and the submission will fail.
# The commit's tree is inspected (not the index) so staged-but-uncommitted
# files do not count. The output is captured rather than piped into `grep -q`,
# which under pipefail can report a false failure when grep exits early.
if git cat-file -e "${COMMIT_SHA}^{commit}" 2>/dev/null; then
  if [[ -z "$(git ls-tree -r --name-only "$COMMIT_SHA" -- "$MANIFESTS_DIR" 2>/dev/null)" ]]; then
    echo "ERROR: '$MANIFESTS_DIR' has no tracked files in the committed state." >&2
    echo " The remote does not have this directory. Commit your changes first." >&2
    exit 1
  fi
else
  echo "WARNING: Commit $COMMIT_SHA is not available locally; cannot verify that" >&2
  echo " '$MANIFESTS_DIR' is part of it." >&2
  echo >&2
fi

if [[ "$excluded_local_changes" == true ]]; then
  echo "Proceeding with committed state at $COMMIT_SHA (uncommitted changes excluded)."
  echo
fi

# Check that the commit is present on the remote (required for commit-sha path).
# Looking only at whether the branch exists on origin is not enough: the branch
# can exist while the commit being submitted has never been pushed.
# These are warnings only — the submission is still attempted.
remote_head="$(git ls-remote origin "refs/heads/$BRANCH" 2>/dev/null | awk 'NR == 1 { print $1 }')" \
  || remote_head=""
# Expand an abbreviated SHA so it can be compared with the remote's full SHA.
local_sha="$(git rev-parse --verify --quiet "${COMMIT_SHA}^{commit}" 2>/dev/null)" \
  || local_sha="$COMMIT_SHA"

if [[ -z "$remote_head" ]]; then
  echo "WARNING: Branch '$BRANCH' may not be pushed to origin yet." >&2
  echo " Run: git push origin $BRANCH" >&2
  echo >&2
elif [[ "$remote_head" != "$local_sha" ]] \
  && ! git merge-base --is-ancestor "$local_sha" "$remote_head" 2>/dev/null; then
  # Either the commit is ahead of origin, or origin has newer commits that
  # have not been fetched, in which case ancestry cannot be decided locally.
  echo "WARNING: Could not confirm that commit $COMMIT_SHA is on origin/$BRANCH." >&2
  echo " If it has not been pushed yet, run: git push origin $BRANCH" >&2
  echo >&2
fi

# ---------------------------------------------------------------------------
# 4. Submit first manifest (primes the Docker build cache)
# ---------------------------------------------------------------------------
# The first manifest (in discovery order) primes the build cache; the rest
# are held back until its build has finished.
FIRST_MANIFEST="${MANIFESTS[0]}"
REMAINING_MANIFESTS=("${MANIFESTS[@]:1}")

section "4. Submitting first manifest: $FIRST_MANIFEST"
echo "This submission will build the Docker image. Subsequent submissions will"
echo "reuse the cached image and finish the image build much faster."
echo

# Handle the failure explicitly: relying on `set -e` alone would end the
# script here without saying which submission went wrong.
if ! FIRST_ID=$(submit_manifest "$FIRST_MANIFEST"); then
  echo "ERROR: Submission of $FIRST_MANIFEST failed; no further manifests were submitted." >&2
  exit 1
fi

# `jq -r` prints the literal "null" for a missing field — that is not an ID.
if [[ -z "$FIRST_ID" || "$FIRST_ID" == "null" ]]; then
  echo "ERROR: No experiment ID was returned for $FIRST_MANIFEST; no further manifests were submitted." >&2
  exit 1
fi

echo
echo "First experiment ID: $FIRST_ID"

# ---------------------------------------------------------------------------
# 5. Wait for build before submitting the rest
# ---------------------------------------------------------------------------
# Waiting only pays off when further manifests will reuse the built image;
# with a single manifest the script goes straight on to the summary.
if (( ${#REMAINING_MANIFESTS[@]} > 0 )); then
  section "5. Waiting for build cache to warm up"
  wait_for_build_complete "$FIRST_ID"
fi

# ---------------------------------------------------------------------------
# 6. Submit remaining manifests
# ---------------------------------------------------------------------------
# Parallel indexed array — EXPERIMENT_ID_LIST[i] is the ID for MANIFESTS[i].
# The slot of a failed submission stays unset so the summary marks it ERROR.
EXPERIMENT_ID_LIST=()
EXPERIMENT_ID_LIST[0]="$FIRST_ID"
FAILED_SUBMISSIONS=0

if [[ ${#REMAINING_MANIFESTS[@]} -gt 0 ]]; then
  section "6. Submitting remaining ${#REMAINING_MANIFESTS[@]} manifest(s)"
  echo "Build cache is warm — these should skip the build step."
  echo

  last_index=$(( ${#REMAINING_MANIFESTS[@]} - 1 ))
  for i in "${!REMAINING_MANIFESTS[@]}"; do
    manifest="${REMAINING_MANIFESTS[$i]}"
    echo "Submitting: $manifest"
    # One failed submission must not abort the whole run: the experiments
    # already submitted keep running, and the user needs the summary to find
    # them. Record the failure and carry on with the next manifest.
    if id=$(submit_manifest "$manifest") && [[ -n "$id" && "$id" != "null" ]]; then
      EXPERIMENT_ID_LIST[$((i + 1))]="$id"
      echo " -> Experiment ID: $id"
    else
      FAILED_SUBMISSIONS=$(( FAILED_SUBMISSIONS + 1 ))
      echo " -> ERROR: submission failed for $manifest" >&2
    fi
    echo

    # Brief pause between submissions to avoid rate-limiting
    # (nothing follows the last one, so no pause is needed there)
    if [[ $i -lt $last_index ]]; then
      sleep 2
    fi
  done
else
  echo
  echo "(Only one manifest found — no additional submissions needed.)"
fi

# ---------------------------------------------------------------------------
# 7. Summary
# ---------------------------------------------------------------------------
section "7. Summary — all submitted experiments"
printf '%-55s %s\n' "Manifest" "Experiment ID"
printf '%-55s %s\n' "$(printf '%.0s─' {1..55})" "$(printf '%.0s─' {1..30})"
for i in "${!MANIFESTS[@]}"; do
  id="${EXPERIMENT_ID_LIST[$i]:-ERROR}"
  printf '%-55s %s\n' "${MANIFESTS[$i]}" "$id"
done

echo
echo "Monitor individual experiments:"
for i in "${!MANIFESTS[@]}"; do
  id="${EXPERIMENT_ID_LIST[$i]:-}"
  # Manifests without an ID have nothing to stream
  if [[ -n "$id" ]]; then
    echo " aichor experiments logs stream $id # $(basename "${MANIFESTS[$i]}")"
  fi
done

echo
echo "List all experiments:"
echo " aichor experiments list --output table"

# Reflect failed submissions in the exit status so callers (CI, wrapper
# scripts) do not mistake a partial run for a fully successful one.
if [[ $FAILED_SUBMISSIONS -gt 0 ]]; then
  echo >&2
  echo "ERROR: $FAILED_SUBMISSIONS submission(s) failed — see the ERROR rows in the summary." >&2
  exit 1
fi