knowledge-base/.gitea/workflows/ingest.yml
compass.admin 7531e31961
All checks were successful
ingest / ingest (push) Successful in 3s
ingest / curate (push) Has been skipped
curator: chain compass-curate after ingest
2026-04-23 17:50:45 +00:00

182 lines
7.9 KiB
YAML

name: ingest
# Fires on every push to main. Computes the diff against the parent
# commit, filters to supported extensions, then invokes `compass-ingest
# run --files <list>` inside a container pulled from
# ${{ vars.COMPASS_APP_IMAGE }}.
#
# The CLI talks directly to postgres-rag + the raw Graphiti Neo4j —
# there is no HTTP hop through the long-running app service.
on:
  push:
    branches:
      - main
jobs:
  ingest:
    runs-on: ubuntu-latest
    container:
      image: ${{ vars.COMPASS_APP_IMAGE }}
      # Same network override + volume allow-list as deploy.yml. See
      # scripts/runner-config.yaml for the rationale.
      network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
    outputs:
      # Space-separated list of UUIDs written by `compass-ingest`; the
      # `curate` job fans out over these. Empty on no-op pushes.
      document_ids: ${{ steps.run-ingest.outputs.document_ids }}
      changed_count: ${{ steps.changes.outputs.changed_count }}
    env:
      # Pgvector chunk store. The CLI reads DATABASE_URL (ottomator
      # legacy name); POSTGRES_RAG_DSN is kept for symmetry with the
      # long-running app service env and the runbook.
      DATABASE_URL: ${{ secrets.POSTGRES_RAG_DSN }}
      POSTGRES_RAG_DSN: ${{ secrets.POSTGRES_RAG_DSN }}
      # Raw Graphiti graph (7688). These are NOT the structured-graph
      # write creds — those only exist in compass/migrations.
      NEO4J_URI: ${{ secrets.NEO4J_RAW_URI }}
      NEO4J_USER: ${{ secrets.NEO4J_RAW_USER }}
      NEO4J_PASSWORD: ${{ secrets.NEO4J_RAW_PASSWORD }}
      # LLM + embedding creds (Graphiti extraction + pgvector embed step).
      # Re-used across both the free-form graph build and the chunker.
      LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      EMBEDDING_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      LLM_CHOICE: ${{ vars.LLM_CHOICE || 'gpt-4o-mini' }}
      EMBEDDING_MODEL: ${{ vars.EMBEDDING_MODEL || 'text-embedding-3-small' }}
steps:
- name: Check out knowledge-base repo
uses: actions/checkout@v4
with:
# Depth 2 gives us $GITHUB_SHA and its parent so `git diff`
# can run. The initial-commit path (parent == 000…) is
# handled below.
fetch-depth: 2
- name: Compute changed files
id: changes
shell: bash
run: |
set -euo pipefail
BEFORE="${{ github.event.before }}"
AFTER="${{ github.sha }}"
# Gitea (like GitHub) reports 40 zeros as the "before" of an
# initial push or a branch creation. Fall back to the full
# file listing so we don't silently skip the first batch.
if [ -z "$BEFORE" ] || [ "$BEFORE" = "0000000000000000000000000000000000000000" ]; then
echo "initial commit or branch creation — ingesting full tree"
git ls-files > /tmp/changed-all.txt
else
# `--diff-filter=d` drops deletions: we don't want to ingest
# a file that no longer exists.
git diff --name-only --diff-filter=d "$BEFORE".."$AFTER" > /tmp/changed-all.txt
fi
# Filter to supported extensions. Keep this list in sync with
# the matcher in app/ingestion/pipeline.py.
grep -Ei '\.(md|txt|pdf|docx|xlsx)$' /tmp/changed-all.txt > /tmp/changed.txt || true
count=$(wc -l < /tmp/changed.txt | tr -d ' ')
echo "changed_count=$count" >> "$GITHUB_OUTPUT"
echo "changed files:"
cat /tmp/changed.txt || true
- name: Ingest changed files
id: run-ingest
if: steps.changes.outputs.changed_count != '0'
shell: bash
run: |
set -euo pipefail
# xargs feed avoids "argument list too long" on large pushes
# while still giving compass-ingest a single invocation so
# the shared DB/graph pools are reused.
mapfile -t files < /tmp/changed.txt
# `tee` so the human-readable log still shows the JSON summary
# while the file feeds the downstream id extraction.
compass-ingest run --files "${files[@]}" | tee /tmp/ingest-summary.json
# Extract UUIDs as space-separated string for the curate job
# fan-out. python3 is present on the compass-app image.
doc_ids="$(python3 -c 'import json,sys; d=json.load(open("/tmp/ingest-summary.json")); print(" ".join(r["document_id"] for r in d["results"] if not r["errors"]))')"
echo "document_ids=$doc_ids" >> "$GITHUB_OUTPUT"
echo "captured document_ids: $doc_ids"
- name: No-op summary
if: steps.changes.outputs.changed_count == '0'
run: echo "No supported files changed — nothing to ingest."
- name: Upload ingest log
if: always()
# v4 uses GHES Artifacts API v2 which Gitea doesn't implement yet.
uses: actions/upload-artifact@v3
with:
name: ingest-${{ github.sha }}
path: |
/tmp/changed.txt
/tmp/ingest-summary.json
# Curator stage — chained synchronously after ingest. Runs the
# RegulatoryObligation spec against each newly-ingested document_id,
# opens a PR against compass/migrations per accepted document. Skips
# (classified_skip / extracted) are non-fatal; only a `failed` curator
# run fails the workflow.
curate:
needs: ingest
if: needs.ingest.outputs.document_ids != ''
runs-on: ubuntu-latest
container:
image: ${{ vars.COMPASS_MCP_IMAGE }}
network: ${{ vars.COMPASS_NETWORK || 'compass_default' }}
env:
# Curator reads chunks + writes agent_runs into postgres-rag.
POSTGRES_RAG_DSN: ${{ secrets.POSTGRES_RAG_DSN }}
# Raw Graphiti graph (7688) — `gather_for_document` reads
# episodes/entities traced back to the document.
NEO4J_RAW_URI: ${{ secrets.NEO4J_RAW_URI }}
NEO4J_RAW_USER: ${{ secrets.NEO4J_RAW_USER }}
NEO4J_RAW_PASSWORD: ${{ secrets.NEO4J_RAW_PASSWORD }}
# Structured graph (7687) — conflict-check reads the live schema.
# `client_from_env` reads NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD.
NEO4J_URI: ${{ secrets.NEO4J_STRUCTURED_URI }}
NEO4J_USER: ${{ secrets.NEO4J_STRUCTURED_USER }}
NEO4J_PASSWORD: ${{ secrets.NEO4J_STRUCTURED_PASSWORD }}
# OpenAI for classify + extract + score LLM calls.
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_BASE_URL: ${{ vars.LLM_BASE_URL || 'https://api.openai.com/v1' }}
CURATOR_CLASSIFY_MODEL: ${{ vars.CURATOR_CLASSIFY_MODEL || 'gpt-4.1-mini' }}
CURATOR_EXTRACT_MODEL: ${{ vars.CURATOR_EXTRACT_MODEL || 'gpt-4.1-mini' }}
CURATOR_MIN_CONFIDENCE: ${{ vars.CURATOR_MIN_CONFIDENCE || '0.7' }}
# Gitea — the curator opens PRs against compass/migrations.
# Gitea rejects secret AND variable names prefixed with GITEA_ /
# GITHUB_, so both are stored as COMPASS_GITEA_*. We remap them
# back to GITEA_* here, which is what `GiteaConfig.from_env` reads.
GITEA_URL: ${{ vars.COMPASS_GITEA_URL || 'http://gitea:3000' }}
GITEA_TOKEN: ${{ secrets.COMPASS_GITEA_TOKEN }}
GITEA_OWNER: ${{ vars.COMPASS_GITEA_OWNER || 'compass' }}
GITEA_MIGRATIONS_REPO: ${{ vars.COMPASS_MIGRATIONS_REPO || 'migrations' }}
steps:
- name: Run curator per document
shell: bash
run: |
set -euo pipefail
any_failed=0
for doc_id in ${{ needs.ingest.outputs.document_ids }}; do
echo "::group::curate $doc_id"
# Don't let a single-document failure abort the loop — we
# want all docs attempted so a poisoned PDF doesn't block
# the rest of the batch. The job exits non-zero at the end
# if any run returned 1 (status=failed).
set +e
compass-curate run --document-id "$doc_id"
rc=$?
set -e
if [ "$rc" = "1" ]; then
echo "::error::curator failed for $doc_id"
any_failed=1
fi
echo "::endgroup::"
done
exit "$any_failed"