#!/bin/bash
# Reproduce the best checkpoint in deterministic mode (bit-reproducible).
#
# Same three stages as reproduce.sh, but with --deterministic:
# 1. Train on 2048-point data (~3hr on 1x RTX 4090)
# 2. Finetune on 4096-point data (~30min)
# 3. Cooldown with endpoint loss (~2hr)
#
# Total: ~5.5hr on a single GPU (no torch.compile, ~2x slower than reproduce.sh).
# Deterministic mode disables torch.compile and forces CUDA deterministic ops.
# Results are bit-identical across runs with the same seed. Expected HSS ~0.372.
# Usage: <this script> [OUT_DIR]
#   OUT_DIR  directory handed to the trainer via --out-dir (default: runs).
#
# The script assumes every trainer invocation leaves a run directory
# "$OUT_DIR/<run>/" containing args.json and a checkpoints/ subdirectory;
# the newest such directory is taken to be the run that just finished.
#
# -e: stop at the first failing command, -u: unset variables are errors,
# pipefail: a pipeline fails when any of its stages fails.
set -euo pipefail

readonly OUT_DIR="${1:-runs}"
# Arguments shared by all stages. An array keeps every element a single word
# without depending on unquoted word splitting.
readonly BASE=(--args-from configs/base.json)

# Print a message on stderr and abort the script with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the directory below $1 whose args.json has the most recent mtime.
# Arguments: $1 - parent directory that holds the run directories
# Outputs:   the run directory path on stdout
# Returns:   0 on success, 1 when no "<dir>/args.json" exists under $1
latest_run_dir() {
  local root=$1
  local candidate newest=""
  for candidate in "$root"/*/args.json; do
    # An unmatched glob stays literal; skip it instead of reporting it.
    [[ -e "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  [[ -n "$newest" ]] || return 1
  printf '%s\n' "${newest%/*}"
}

# Abort with a clear message unless the checkpoint file $1 exists, so a
# missing checkpoint is reported before the next stage is launched.
require_checkpoint() {
  [[ -f "$1" ]] || die "expected checkpoint not found: $1"
}

# Run one training stage.
# Arguments: stage-specific trainer flags
# The shared BASE arguments are placed first and --deterministic/--out-dir
# last, which is the same argument order every stage used before.
train() {
  python -m s23dr_2026_example.train "${BASE[@]}" "$@" \
    --deterministic \
    --out-dir "$OUT_DIR"
}

# ============================================================
# Step 1: Train on 2048-point data (Phase 1)
# ============================================================
echo "=== Step 1: Training on 2048 data (deterministic) ==="
train \
  --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
  --seq-len 2048 \
  --lr 3e-4 \
  --batch-size 32 \
  --steps 125000
STEP1_DIR=$(latest_run_dir "$OUT_DIR") \
  || die "step 1 left no run directory (*/args.json) under $OUT_DIR"
echo "Step 1 complete: $STEP1_DIR"

# ============================================================
# Step 2: Finetune on 4096-point data
# ============================================================
echo "=== Step 2: Finetuning on 4096 data (deterministic) ==="
STEP1_CKPT="$STEP1_DIR/checkpoints/step125000.pt"
require_checkpoint "$STEP1_CKPT"
train \
  --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
  --resume "$STEP1_CKPT" \
  --seq-len 4096 \
  --lr 3e-5 \
  --batch-size 64 \
  --steps 135000
STEP2_DIR=$(latest_run_dir "$OUT_DIR") \
  || die "step 2 left no run directory (*/args.json) under $OUT_DIR"
echo "Step 2 complete: $STEP2_DIR"

# ============================================================
# Step 3: Cooldown with endpoint loss
# ============================================================
echo "=== Step 3: Cooldown with endpoint loss (deterministic) ==="
STEP2_CKPT="$STEP2_DIR/checkpoints/step135000.pt"
require_checkpoint "$STEP2_CKPT"
train \
  --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
  --resume "$STEP2_CKPT" \
  --seq-len 4096 \
  --lr 3e-5 \
  --batch-size 64 \
  --endpoint-weight 0.1 \
  --cooldown-start 150000 \
  --cooldown-steps 20000 \
  --steps 170000
STEP3_DIR=$(latest_run_dir "$OUT_DIR") \
  || die "step 3 left no run directory (*/args.json) under $OUT_DIR"
echo "Step 3 complete: $STEP3_DIR"

echo ""
echo "Final checkpoint: $STEP3_DIR/checkpoints/final.pt"
echo "Copy to checkpoint.pt for submission."
|