# ara/ss14-cicd/trace/exploration_tree.yaml

# Exploration Tree — ss14-cicd
# Research DAG: nested tree with cross-edges (also_depends_on) forming a DAG.
# Source: Session logs 2025-12-14 through 2025-12-19 (HISTORY.md)
# Node types: question | experiment | dead_end | decision | pivot

tree:
  - id: N01
    type: question
    support_level: explicit
    source_refs: ["HISTORY.md: 2025-12-14"]
    title: "How to build wylab-station-14 Docker image automatically on commit?"
    description: >
      The wylab-station-14 Space Station 14 game server needs automated builds
      via Gitea Actions on git.wylab.me. Commits to the repo should trigger a
      pipeline that builds the Docker image and makes it available for deployment
      on the Unraid server.
    children:

      - id: N02
        type: experiment
        support_level: explicit
        source_refs: ["HISTORY.md: 2025-12-14"]
        title: "Set up act-runner on Unraid (container mode, bridge network)"
        result: >
          Pipeline did not trigger on commits. Runner registered successfully but
          job containers could not resolve git.wylab.me — DNS resolution failed
          inside the runner job containers. Host networking partially worked:
          1 out of 6 jobs succeeded, inconsistently.
        evidence: ["C03", "HISTORY.md: 2025-12-14"]
        children:

          - id: N03
            type: dead_end
            support_level: explicit
            source_refs: ["HISTORY.md: 2025-12-14"]
            title: "Adding 1.1.1.1 DNS to runner container"
            hypothesis: "External DNS resolver will allow git.wylab.me resolution from job containers"
            failure_mode: >
              1.1.1.1 cannot resolve private internal hostnames. git.wylab.me is only
              resolvable via Technitium DNS at 192.168.1.50. Public DNS has no record for it.
            lesson: >
              DNS config must target job containers specifically (not the runner process container),
              AND must point to an internal resolver that knows git.wylab.me.

          - id: N04
            type: dead_end
            support_level: explicit
            source_refs: ["HISTORY.md: 2025-12-14"]
            title: "Host networking mode on Unraid runner"
            hypothesis: "Host network mode gives job containers access to host DNS resolver"
            failure_mode: >
              Inconsistent: only 1 of 6 job containers successfully resolved hostnames.
              The exact root cause of the inconsistency was not determined. The
              host-networking changes were reverted (multiple reverts were required).
            lesson: >
              Host networking is not a stable fix. The correct fix is to configure
              container.dns in runner config.yml to point at Docker bridge gateway (172.17.0.1)
              which forwards to Technitium.

          - id: N05
            type: decision
            support_level: explicit
            source_refs: ["HISTORY.md: 2025-12-15"]
            title: "Move to external VPS runner to bypass Unraid networking constraints"
            choice: "Deploy act-runner on external VPS (45.137.68.83) with direct internet access"
            alternatives:
              - "Continue debugging Unraid container runner DNS"
              - "Use host networking on Unraid runner (unstable)"
              - "Switch to a different CI system (Drone CI, Jenkins)"
            evidence: "1/6 success rate on Unraid was deemed insufficient; external VPS has direct DNS access"
            children:

              - id: N06
                type: experiment
                support_level: explicit
                source_refs: ["HISTORY.md: 2025-12-15"]
                title: "External VPS runner (45.137.68.83) with native Gitea cache"
                result: >
                  Runner started but had persistent Node.js module errors: "Cannot find module"
                  in /opt/gitea-runner/.cache/act/. SSH debugging sessions required.
                  Native Gitea cache server (act-cache-server) timed out: ETIMEDOUT on
                  45.137.68.83:39913. .NET cache step took 5 minutes vs 5 seconds for other steps —
                  indicating full cache miss every build.
                evidence: ["C02", "C03", "HISTORY.md: 2025-12-15"]
                children:

                  - id: N07
                    type: dead_end
                    support_level: explicit
                    source_refs: ["HISTORY.md: 2025-12-15"]
                    title: "Native Gitea act-cache-server on external VPS"
                    hypothesis: "Native Gitea cache server will provide fast cache hits for .NET packages"
                    failure_mode: >
                      ETIMEDOUT connecting to 45.137.68.83:39913 from inside job containers.
                      Every build was a full cold build (5 min cache step vs 5 sec).
                      Likely cause: firewall or Docker bridge network blocking port 39913.
                    lesson: >
                      Native act-cache-server requires port 39913 reachable from job containers.
                      Local file cache (volume mount) is more reliable — bypasses HTTP protocol.

                  - id: N08
                    type: decision
                    support_level: explicit
                    source_refs: ["HISTORY.md: 2025-12-18"]
                    title: "Add macOS ARM64 runner (OrbStack) as additional runner"
                    choice: "Register Mac ARM64 OrbStack runner to supplement or replace external VPS runner"
                    alternatives:
                      - "Continue debugging external VPS runner"
                      - "Return to Unraid container runner with DNS fix"
                      - "Replace Gitea Actions with alternative CI (Drone, Jenkins, GitHub Actions)"
                    evidence: "External VPS runner crashing under load; developer Mac available with Docker via OrbStack"
                    children:

                      - id: N09
                        type: experiment
                        support_level: explicit
                        source_refs: ["HISTORY.md: 2025-12-18", "HISTORY.md: 2025-12-19"]
                        title: "macOS ARM64 OrbStack runner — capacity and cache tuning"
                        result: >
                          Runner worked initially. OOM crashes under concurrent dotnet builds forced
                          capacity reduction: 6 → 4 → 3 → 2 concurrent jobs. Local file cache
                          configured (stable). shutdown_timeout: 30m added to prevent zombie containers.
                          However: mixed architecture with Unraid runner caused cache corruption —
                          arm64 cache entries consumed by x86-64 jobs, producing wrong-arch artifacts
                          silently. Runner kept crashing under load. Unresolved as of 2025-12-19.
                        evidence: ["C01", "C04", "HISTORY.md: 2025-12-19"]
                        children:

                          - id: N10
                            type: dead_end
                            support_level: explicit
                            source_refs: ["HISTORY.md: 2025-12-18"]
                            title: "Running two act-runner instances on same Mac host"
                            hypothesis: "Second runner instance increases parallel capacity"
                            failure_mode: >
                              Second runner registration immediately broke the existing first runner.
                              Port/socket conflicts between instances. Had to delete runner 2 and revert.
                            lesson: >
                              Multiple act-runner instances on same host require distinct ports,
                              work directories, and config paths. Simpler: one runner per host,
                              tune capacity instead.

                          - id: N11
                            type: decision
                            support_level: inferred
                            title: "Architecture-tagged cache keys as fix for silent cache corruption"
                            choice: >
                              Encode runner.arch in cache keys: 'dotnet-${{ runner.arch }}-${{ hashFiles(...) }}'
                              OR pin all builds to Unraid runner via runs-on label
                            alternatives:
                              - "Continue with architecture-agnostic keys (known failure mode)"
                              - "Abandon Mac runner entirely, return to Unraid-only runner"
                              - "Use separate cache backends per runner (NFS mount isolation)"
                            evidence: >
                              Root cause analysis: arm64 and x86-64 (amd64) runners with identical
                              project file hashes produce identical cache keys but incompatible artifacts.
                              Including the architecture in the key guarantees that keys generated on
                              different architectures never match, so cross-architecture cache collisions
                              cannot occur.