Initial fleet ansible config - 2026-05-21T14:00:49-04:00

2026-05-21 14:00:49 -04:00
commit cea2a0ff15
8 changed files with 287 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,35 @@
+# Ansible Pull — Iron Legion Fleet
+
+Auto-applied Ansible playbooks for the Iron Legion AI agent fleet.
+
+## How It Works
+
+Each node runs `ansible-pull` every 5 minutes via cron. It clones this repo and applies `local.yml` to itself.
+
+## Repo Structure
+
+```
+.
+├── local.yml           # Main playbook — always runs
+├── group_vars/
+│   └── all.yml         # Fleet-wide variables
+├── host_vars/
+│   ├── artemis.yml     # Artemis (AI Foreman) specific
+│   ├── hulkbuster.yml  # Mark44 (Hulkbuster) specific
+│   ├── mark5.yml       # Mark5 (Suitcase) specific
+│   └── bones.yml       # Bones (Mark XLI) specific
+└── roles/
+    └── common/
+        └── tasks/
+            └── main.yml
+```
+
+## Adding Node-Specific Tasks
+
+Edit the corresponding `host_vars/` file with node-specific vars (packages, configs); `local.yml` loads the file whose name matches the node's lowercase short hostname. Edit `local.yml` for shared tasks that apply to all nodes.
+
+## Security
+
+- HTTPS auth via deploy token stored in `/etc/ansible/ansible.env`
+- Token is root-readable only (chmod 600)
+- Gitea provides TLS via NetBird mesh
--- a/group_vars/all.yml
+++ b/group_vars/all.yml
@@ -0,0 +1,10 @@
+---
+# Defaults shared by every node in the fleet
+
+# Location of the playbook repository on Gitea (host, organisation, repository)
+gitea_repo: 'ansible-pull-deploy'
+gitea_org: 'Iron-Legion'
+gitea_base_url: 'gitea.nb.bobbysh.me'
+
+# Cron expression for the ansible-pull job (every five minutes)
+ansible_pull_cron_schedule: '*/5 * * * *'
--- a/host_vars/artemis.yml
+++ b/host_vars/artemis.yml
@@ -0,0 +1,31 @@
+---
+# Artemis (AI Foreman): control node without an NVIDIA GPU
+has_gpu: false
+node_type: foreman
+
+# Hermes settings
+hermes_config:
+  context_length: 128000
+  model: openai/gpt-4o-mini
+  provider: openrouter
+
+# Units whose `enabled` flag local.yml manages (it sets no `state`, so it does not start them)
+managed_services:
+  - name: hermes-gateway
+    enabled: true
+  - name: hermes-dashboard
+    enabled: true
+
+# Models for Artemis (CPU inference, so small models only)
+ollama_models:
+  - 'gemma3:4b'         # small enough for CPU
+  - 'phi4-mini:latest'  # tiny, fast
+
+# Monitoring and control packages for Artemis
+extra_packages:
+  - nvtop       # GPU monitoring (uses AMD iGPU info if available)
+  - nethogs     # per-process network monitoring
+  - iotop       # per-process I/O monitoring
+  - lm-sensors  # temperature/fan monitoring
+  - stress-ng   # load testing
+  - cockpit     # web-based system management
--- a/host_vars/bones.yml
+++ b/host_vars/bones.yml
@@ -0,0 +1,27 @@
+---
+# Bones (Mark XLI): headless node, CPU only
+has_gpu: false
+node_type: headless
+
+# Variables used only by Bones
+jvm_heap: '512m'
+bones_storage: '256GB SSD'
+
+# Ollama models (CPU mode, very small models)
+ollama_models:
+  - 'gemma3:1b'  # ultra-tiny, for CPU
+
+# Services managed on Bones
+managed_services:
+  - name: jarvis  # Paperclip + Ollama + PostgreSQL stack
+    enabled: true
+  - name: ollama  # CPU inference only
+    enabled: true
+
+# Essentials for a headless box
+extra_packages:
+  - cpufrequtils   # CPU frequency management
+  - lm-sensors     # temperature monitoring
+  - smartmontools  # disk health monitoring
+  - hdparm         # disk performance tuning
+  - netdata        # lightweight monitoring (optional)
--- a/host_vars/hulkbuster.yml
+++ b/host_vars/hulkbuster.yml
@@ -0,0 +1,32 @@
+---
+# Mark44 (Hulkbuster): heavy GPU compute node
+# NOTE(review): local.yml loads host_vars/<lowercase short hostname>.yml; confirm this host is named `hulkbuster`, not `mark44`.
+gpu_type: nvidia
+gpu_model: 'RTX 4070'
+vram_mb: 12282
+has_gpu: true
+node_type: gpu_heavy
+
+ollama_port: 11434  # Ollama port override (standard)
+open_webui_port: 8080
+
+# Services
+managed_services:
+  - name: ollama
+    enabled: true
+  - name: open-webui
+    enabled: true
+  - name: hermes-gateway
+    enabled: true
+
+# Ollama models: largest VRAM headroom in the fleet, so the big models live here
+ollama_models:
+  - 'gemma4:e4b'         # already pulled, keep it
+  - 'qwen2.5-coder:14b'  # primary coding model
+  - 'qwen2.5:14b'        # general-purpose large model
+
+# GPU tooling only (no drivers: Mark44 uses the proprietary NVIDIA package)
+extra_packages:
+  - nvtop    # GPU monitoring
+  - nethogs  # per-process network monitoring
+  - iotop    # per-process I/O monitoring
--- a/host_vars/mark5.yml
+++ b/host_vars/mark5.yml
@@ -0,0 +1,30 @@
+---
+# Mark5 (Suitcase): mobile node with a light GPU
+gpu_type: nvidia
+gpu_model: 'RTX 4060 Laptop'
+vram_mb: 8188
+has_gpu: true
+node_type: gpu_light
+
+# Ollama port
+ollama_port: 11434
+
+# Services
+managed_services:
+  - name: ollama
+    enabled: true
+  - name: hermes-gateway
+    enabled: true
+
+# Ollama models: VRAM is limited, so smaller models only
+ollama_models:
+  - 'qwen2.5-coder:7b'  # small coding model
+  - 'gemma3:4b'         # tiny, fast
+  - 'llama3.1:8b'       # balanced
+
+# Packages specific to a laptop
+extra_packages:
+  - nvtop     # GPU monitoring
+  - powertop  # power management analysis
+  - tlp       # laptop power management
+  - htop      # already in the baseline, ensure present
--- a/host_vars/nebuchadnezzar.yml
+++ b/host_vars/nebuchadnezzar.yml
@@ -0,0 +1,28 @@
+---
+# Neo (Nebuchadnezzar): services node
+# Runs Nextcloud AIO + Vaultwarden ONLY. Debian.
+has_gpu: false
+node_type: services
+
+# Neo-specific facts
+neo_services:
+  - nextcloud_aio
+  - vaultwarden
+
+# Not an inference node, so no Ollama models
+ollama_models: []
+
+# Services managed on Neo
+managed_services:
+  - name: nextcloud-aio
+    enabled: true
+  - name: vaultwarden
+    enabled: true
+
+# Packages for the services hosted here
+extra_packages:
+  - docker.io       # Nextcloud AIO is container-based
+  - docker-compose  # Compose for multi-service stacks
+  - apache2-utils   # htpasswd for Vaultwarden basic auth
+  - certbot         # Let's Encrypt automation
+  - cron            # ensure cron is present
--- a/local.yml
+++ b/local.yml
@@ -0,0 +1,94 @@
+---
+- hosts: localhost
+  connection: local
+  become: true
+  tasks:
+    # Load host_vars/<lowercase short hostname>.yml if present; hosts without a file are skipped, a broken file still fails.
+    - name: Load host-specific variables
+      include_vars: "{{ item }}"
+      with_first_found:
+        - {files: ["{{ playbook_dir }}/host_vars/{{ ansible_hostname | lower }}.yml"], skip: true}
+      tags: [vars, always]
+
+    - name: Print start message
+      debug:
+        msg: "Ansible Pull running on {{ ansible_hostname }} ({{ inventory_hostname }}) — role: {{ node_type | default('unspecified') }}"
+
+    # --- Baseline for every node (Debian-family only) ---
+    - name: Ensure apt packages are updated
+      when: ansible_os_family == "Debian"
+      tags: [baseline]
+      apt:
+        cache_valid_time: 3600  # refresh only if the cache is older than an hour
+        update_cache: true
+
+    - name: Ensure common packages installed
+      when: ansible_os_family == "Debian"
+      tags: [baseline]
+      apt:
+        state: present
+        name:
+          - curl
+          - git
+          - htop
+          - tmux
+          - jq
+          - vim
+          - python3-pip
+
+    # --- Per-node packages ---
+    # Skipped off Debian-family systems and when `extra_packages` is unset or empty.
+    - name: Ensure node-specific extra packages installed
+      tags: [node_specific]
+      when: >-
+        ansible_os_family == "Debian"
+        and extra_packages is defined and extra_packages | length > 0
+      apt:
+        state: present
+        name: "{{ extra_packages }}"
+
+    # --- NODE-SPECIFIC: Ollama model management ---
+    # Probe only (nothing is installed here): rc is 0 when `ollama` is on PATH.
+    - name: Check whether Ollama is installed
+      command: which ollama
+      register: ollama_check
+      failed_when: false
+      changed_when: false
+      tags: [ollama]
+
+    # default([]) gives hosts without `ollama_models` an empty loop, so the
+    # task is skipped rather than depending on an undefined variable.
+    - name: Pull node-specific Ollama models
+      command: "ollama pull {{ item }}"
+      loop: "{{ ollama_models | default([]) }}"
+      when: ollama_check.rc == 0
+      register: ollama_pull_result
+      tags: [ollama]
+
+    # --- Per-node services (placeholder) ---
+    # Sets only the `enabled` flag of each listed unit; no `state` is given, so
+    # nothing is started here. Failures are ignored.
+    - name: Ensure managed services are enabled
+      tags: [services]
+      ignore_errors: true
+      when: managed_services is defined and managed_services | length > 0
+      loop: "{{ managed_services }}"
+      systemd:
+        enabled: "{{ item.enabled | default(true) }}"
+        name: "{{ item.name }}"
+
+    # --- Artemis-specific: monitoring dashboard ---
+    # The hostname is lowercased, as in the host-vars loading task, so a
+    # mixed-case hostname such as "Artemis" still matches.
+    - name: Ensure Artemis cockpit available
+      apt:
+        name: [cockpit, cockpit-pcp]
+        state: present
+      when:
+        - inventory_hostname == "artemis.ai.home" or ansible_hostname | lower == "artemis"
+        - ansible_os_family == "Debian"
+      tags: [artemis]
+
+    - name: Print completion message
+      debug:
+        msg: "Baseline complete on {{ ansible_hostname }} — node_type={{ node_type | default('unspecified') }}, gpu={{ has_gpu | default(false) }}"