---
name: ingest
# Fires on every push to main. Computes the diff against the parent
# commit, filters to supported extensions, then invokes `compass-ingest
# run --files ` inside a container pulled from
# ${{ vars.COMPASS_APP_IMAGE }}.
#
# The CLI talks directly to postgres-rag + the raw Graphiti Neo4j —
# there is no HTTP hop through the long-running app service.
on:
  push:
    branches:
      - main

jobs:
  ingest:
    runs-on: ubuntu-latest
    container:
      image: ${{ vars.COMPASS_APP_IMAGE }}
      # Same network override + volume allow-list as deploy.yml. See
      # scripts/runner-config.yaml for the rationale.
      network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
    outputs:
      # Space-separated list of UUIDs written by `compass-ingest`; the
      # `curate` job fans out over these. Empty on no-op pushes.
      document_ids: ${{ steps.run-ingest.outputs.document_ids }}
      changed_count: ${{ steps.changes.outputs.changed_count }}
    env:
      # Pgvector chunk store. The CLI reads DATABASE_URL (ottomator
      # legacy name); POSTGRES_RAG_DSN is kept for symmetry with the
      # long-running app service env and the runbook.
      DATABASE_URL: ${{ secrets.POSTGRES_RAG_DSN }}
      POSTGRES_RAG_DSN: ${{ secrets.POSTGRES_RAG_DSN }}
      # Raw Graphiti graph (7688). These are NOT the structured-graph
      # write creds — those only exist in compass/migrations.
      NEO4J_URI: ${{ secrets.NEO4J_RAW_URI }}
      NEO4J_USER: ${{ secrets.NEO4J_RAW_USER }}
      NEO4J_PASSWORD: ${{ secrets.NEO4J_RAW_PASSWORD }}
      # LLM + embedding creds (Graphiti extraction + pgvector embed step).
      # Re-used across both the free-form graph build and the chunker.
      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LLM_CHOICE: ${{ vars.LLM_CHOICE || 'gpt-4o-mini' }}
      EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL || 'text-embedding-3-small' }}
    steps:
      - name: Check out knowledge-base repo
        uses: actions/checkout@v4
        with:
          # Depth 2 gives us $GITHUB_SHA and its parent so `git diff`
          # can run. The initial-commit path (parent == 000…) is
          # handled below.
          fetch-depth: 2

      - name: Compute changed files
        id: changes
        shell: bash
        run: |
          set -euo pipefail
          BEFORE="${{ github.event.before }}"
          AFTER="${{ github.sha }}"
          # Gitea (like GitHub) reports 40 zeros as the "before" of an
          # initial push or a branch creation. Fall back to the full
          # file listing so we don't silently skip the first batch.
          if [ -z "$BEFORE" ] || [ "$BEFORE" = "0000000000000000000000000000000000000000" ]; then
            echo "initial commit or branch creation — ingesting full tree"
            git ls-files > /tmp/changed-all.txt
          else
            # `--diff-filter=d` drops deletions: we don't want to ingest
            # a file that no longer exists.
            git diff --name-only --diff-filter=d "$BEFORE".."$AFTER" > /tmp/changed-all.txt
          fi
          # Filter to supported extensions. Keep this list in sync with
          # the matcher in app/ingestion/pipeline.py.
          grep -Ei '\.(md|txt|pdf|docx|xlsx)$' /tmp/changed-all.txt > /tmp/changed.txt || true
          count=$(wc -l < /tmp/changed.txt | tr -d ' ')
          echo "changed_count=$count" >> "$GITHUB_OUTPUT"
          echo "changed files:"
          cat /tmp/changed.txt || true

      - name: Ingest changed files
        id: run-ingest
        if: steps.changes.outputs.changed_count != '0'
        shell: bash
        run: |
          set -euo pipefail
          # Read the file list into a bash array so paths containing
          # spaces survive, while still giving compass-ingest a single
          # invocation so the shared DB/graph pools are reused.
          mapfile -t files < /tmp/changed.txt
          # `tee` so the human-readable log still shows the JSON summary
          # while the file feeds the downstream id extraction.
          compass-ingest run --files "${files[@]}" | tee /tmp/ingest-summary.json
          # Extract UUIDs as space-separated string for the curate job
          # fan-out. python3 is present on the compass-app image.
          doc_ids="$(python3 -c 'import json,sys; d=json.load(open("/tmp/ingest-summary.json")); print(" ".join(r["document_id"] for r in d["results"] if not r["errors"]))')"
          echo "document_ids=$doc_ids" >> "$GITHUB_OUTPUT"
          echo "captured document_ids: $doc_ids"

      - name: No-op summary
        if: steps.changes.outputs.changed_count == '0'
        run: echo "No supported files changed — nothing to ingest."

      - name: Upload ingest log
        if: always()
        # v4 uses GHES Artifacts API v2 which Gitea doesn't implement yet.
        uses: actions/upload-artifact@v3
        with:
          name: ingest-${{ github.sha }}
          path: |
            /tmp/changed.txt
            /tmp/ingest-summary.json

  # Curator stage — chained synchronously after ingest. Runs the
  # RegulatoryObligation spec against each newly-ingested document_id,
  # opens a PR against compass/migrations per accepted document. Skips
  # (classified_skip / extracted) are non-fatal; only a `failed` curator
  # run fails the workflow.
  curate:
    needs: ingest
    if: needs.ingest.outputs.document_ids != ''
    runs-on: ubuntu-latest
    container:
      image: ${{ vars.COMPASS_MCP_IMAGE }}
      network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
    env:
      # Curator reads chunks + writes agent_runs into postgres-rag.
      POSTGRES_RAG_DSN: ${{ secrets.POSTGRES_RAG_DSN }}
      # Raw Graphiti graph (7688) — `gather_for_document` reads
      # episodes/entities traced back to the document.
      NEO4J_RAW_URI: ${{ secrets.NEO4J_RAW_URI }}
      NEO4J_RAW_USER: ${{ secrets.NEO4J_RAW_USER }}
      NEO4J_RAW_PASSWORD: ${{ secrets.NEO4J_RAW_PASSWORD }}
      # Structured graph (7687) — conflict-check reads the live schema.
      # `client_from_env` reads NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD.
      NEO4J_URI: ${{ secrets.NEO4J_STRUCTURED_URI }}
      NEO4J_USER: ${{ secrets.NEO4J_STRUCTURED_USER }}
      NEO4J_PASSWORD: ${{ secrets.NEO4J_STRUCTURED_PASSWORD }}
      # OpenAI for classify + extract + score LLM calls.
      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LLM_BASE_URL: ${{ vars.LLM_BASE_URL || 'https://api.openai.com/v1' }}
      CURATOR_CLASSIFY_MODEL: ${{ vars.CURATOR_CLASSIFY_MODEL || 'gpt-4.1-mini' }}
      CURATOR_EXTRACT_MODEL: ${{ vars.CURATOR_EXTRACT_MODEL || 'gpt-4.1-mini' }}
      CURATOR_MIN_CONFIDENCE: ${{ vars.CURATOR_MIN_CONFIDENCE || '0.7' }}
      # Gitea — the curator opens PRs against compass/migrations.
      # Gitea rejects secret AND variable names prefixed with GITEA_ /
      # GITHUB_, so both are stored as COMPASS_GITEA_*. We remap them
      # back to GITEA_* here, which is what `GiteaConfig.from_env` reads.
      GITEA_URL: ${{ vars.COMPASS_GITEA_URL || 'http://gitea:3000' }}
      GITEA_TOKEN: ${{ secrets.COMPASS_GITEA_TOKEN }}
      GITEA_OWNER: ${{ vars.COMPASS_GITEA_OWNER || 'compass' }}
      GITEA_MIGRATIONS_REPO: ${{ vars.COMPASS_MIGRATIONS_REPO || 'migrations' }}
    steps:
      - name: Run curator per document
        shell: bash
        run: |
          set -euo pipefail
          any_failed=0
          for doc_id in ${{ needs.ingest.outputs.document_ids }}; do
            echo "::group::curate $doc_id"
            # Don't let a single-document failure abort the loop — we
            # want all docs attempted so a poisoned PDF doesn't block
            # the rest of the batch. The job exits non-zero at the end
            # if any run returned 1 (status=failed).
            set +e
            compass-curate run --document-id "$doc_id"
            rc=$?
            set -e
            # NOTE(review): only rc=1 is treated as fatal here, so a
            # crash with any other non-zero code (e.g. 2, 127, 139)
            # passes silently — confirm that is the intended contract.
            if [ "$rc" = "1" ]; then
              echo "::error::curator failed for $doc_id"
              any_failed=1
            fi
            echo "::endgroup::"
          done
          exit "$any_failed"