From 7531e319619c515dfb4bab1317bdfd1d178f976c Mon Sep 17 00:00:00 2001
From: "compass.admin"
Date: Thu, 23 Apr 2026 17:50:45 +0000
Subject: [PATCH] curator: chain compass-curate after ingest

---
 .gitea/workflows/ingest.yml | 80 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/.gitea/workflows/ingest.yml b/.gitea/workflows/ingest.yml
index 991bf1d..65fa310 100644
--- a/.gitea/workflows/ingest.yml
+++ b/.gitea/workflows/ingest.yml
@@ -21,6 +21,11 @@ jobs:
       # Same network override + volume allow-list as deploy.yml. See
       # scripts/runner-config.yaml for the rationale.
       network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
+    outputs:
+      # Space-separated list of UUIDs written by `compass-ingest`; the
+      # `curate` job fans out over these. Empty on no-op pushes.
+      document_ids: ${{ steps.run-ingest.outputs.document_ids }}
+      changed_count: ${{ steps.changes.outputs.changed_count }}
     env:
       # Pgvector chunk store. The CLI reads DATABASE_URL (ottomator
       # legacy name); POSTGRES_RAG_DSN is kept for symmetry with the
@@ -79,6 +84,7 @@
           cat /tmp/changed.txt || true
 
       - name: Ingest changed files
+        id: run-ingest
         if: steps.changes.outputs.changed_count != '0'
         shell: bash
        run: |
@@ -87,7 +93,14 @@
           # while still giving compass-ingest a single invocation so
           # the shared DB/graph pools are reused.
           mapfile -t files < /tmp/changed.txt
-          compass-ingest run --files "${files[@]}"
+          # `tee` so the human-readable log still shows the JSON summary
+          # while the file feeds the id extraction below (pipefail is on).
+          compass-ingest run --files "${files[@]}" | tee /tmp/ingest-summary.json
+          # Extract UUIDs as space-separated string for the curate job
+          # fan-out. python3 is present on the compass-app image.
+          doc_ids="$(python3 -c 'import json,sys; d=json.load(open("/tmp/ingest-summary.json")); print(" ".join(r["document_id"] for r in d["results"] if not r["errors"]))')"
+          echo "document_ids=$doc_ids" >> "$GITHUB_OUTPUT"
+          echo "captured document_ids: $doc_ids"
 
       - name: No-op summary
         if: steps.changes.outputs.changed_count == '0'
@@ -101,3 +114,68 @@
           name: ingest-${{ github.sha }}
           path: |
             /tmp/changed.txt
+            /tmp/ingest-summary.json
+
+  # Curator stage — chained synchronously after ingest. Runs the
+  # RegulatoryObligation spec against each newly-ingested document_id,
+  # opens a PR against compass/migrations per accepted document. Skips
+  # (classified_skip / extracted) are non-fatal; any non-zero curator
+  # exit fails the workflow.
+  curate:
+    needs: ingest
+    if: needs.ingest.outputs.document_ids != ''
+    runs-on: ubuntu-latest
+    container:
+      image: ${{ vars.COMPASS_MCP_IMAGE }}
+      network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
+    env:
+      # Curator reads chunks + writes agent_runs into postgres-rag.
+      POSTGRES_RAG_DSN: ${{ secrets.POSTGRES_RAG_DSN }}
+      # Raw Graphiti graph (7688) — `gather_for_document` reads
+      # episodes/entities traced back to the document.
+      NEO4J_RAW_URI: ${{ secrets.NEO4J_RAW_URI }}
+      NEO4J_RAW_USER: ${{ secrets.NEO4J_RAW_USER }}
+      NEO4J_RAW_PASSWORD: ${{ secrets.NEO4J_RAW_PASSWORD }}
+      # Structured graph (7687) — conflict-check reads the live schema.
+      # `client_from_env` reads NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD.
+      NEO4J_URI: ${{ secrets.NEO4J_STRUCTURED_URI }}
+      NEO4J_USER: ${{ secrets.NEO4J_STRUCTURED_USER }}
+      NEO4J_PASSWORD: ${{ secrets.NEO4J_STRUCTURED_PASSWORD }}
+      # OpenAI for classify + extract + score LLM calls.
+      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      LLM_BASE_URL: ${{ vars.LLM_BASE_URL || 'https://api.openai.com/v1' }}
+      CURATOR_CLASSIFY_MODEL: ${{ vars.CURATOR_CLASSIFY_MODEL || 'gpt-4.1-mini' }}
+      CURATOR_EXTRACT_MODEL: ${{ vars.CURATOR_EXTRACT_MODEL || 'gpt-4.1-mini' }}
+      CURATOR_MIN_CONFIDENCE: ${{ vars.CURATOR_MIN_CONFIDENCE || '0.7' }}
+      # Gitea — the curator opens PRs against compass/migrations.
+      # Gitea rejects secret AND variable names prefixed with GITEA_ /
+      # GITHUB_, so both are stored as COMPASS_GITEA_*. We remap them
+      # back to GITEA_* here, which is what `GiteaConfig.from_env` reads.
+      GITEA_URL: ${{ vars.COMPASS_GITEA_URL || 'http://gitea:3000' }}
+      GITEA_TOKEN: ${{ secrets.COMPASS_GITEA_TOKEN }}
+      GITEA_OWNER: ${{ vars.COMPASS_GITEA_OWNER || 'compass' }}
+      GITEA_MIGRATIONS_REPO: ${{ vars.COMPASS_MIGRATIONS_REPO || 'migrations' }}
+
+    steps:
+      - name: Run curator per document
+        shell: bash
+        run: |
+          set -euo pipefail
+          any_failed=0
+          for doc_id in ${{ needs.ingest.outputs.document_ids }}; do
+            echo "::group::curate $doc_id"
+            # Don't let a single-document failure abort the loop — we
+            # want all docs attempted so a poisoned PDF doesn't block
+            # the rest of the batch. The job exits non-zero at the end
+            # if any run exited non-zero (failed, crashed, missing CLI).
+            set +e
+            compass-curate run --document-id "$doc_id"
+            rc=$?
+            set -e
+            if [ "$rc" -ne 0 ]; then
+              echo "::error::curator failed for $doc_id"
+              any_failed=1
+            fi
+            echo "::endgroup::"
+          done
+          exit "$any_failed"