"""
ollama_client.py — Shared Ollama client for all Performance West monitor scripts.
Handles model warmup, session reuse via keep_alive, and SSH tunneling.
"""

import http.client
import json
import logging
import os
import subprocess
import time
import urllib.request
from pathlib import Path
from typing import Optional

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# The production host is recognised by the presence of its install directory.
ON_PROD = os.path.exists("/opt/performancewest")

REMOTE_HOST = "172.18.0.2"
REMOTE_PORT = "11434"
TUNNEL_PORT = "11435"
SSH_TARGET = "justin@207.174.124.70"
SSH_PORT = "22022"
SSH_KEY = str(Path.home() / ".ssh/id_ed25519")

# On production talk to Ollama directly; elsewhere go through the local end
# of the SSH port forward opened by start_tunnel().
OLLAMA_BASE = (
    f"http://{REMOTE_HOST}:{REMOTE_PORT}"
    if ON_PROD
    else f"http://localhost:{TUNNEL_PORT}"
)

MODEL = "qwen2.5:3b"
TIMEOUT = 300
KEEP_ALIVE = "30m"  # Keep model loaded for 30 minutes between calls

# Handle of the ssh process spawned by start_tunnel(), if any.
_tunnel_proc: Optional[subprocess.Popen] = None
# Set once warmup() has completed a successful generate() call.
_warmed_up = False


# ---------------------------------------------------------------------------
# Tunnel management (local dev only)
# ---------------------------------------------------------------------------

def _ollama_reachable(timeout: float) -> bool:
    """Return True if GET {OLLAMA_BASE}/api/tags succeeds within *timeout* seconds."""
    try:
        # Context manager closes the response socket; the body is not needed.
        with urllib.request.urlopen(f"{OLLAMA_BASE}/api/tags", timeout=timeout):
            return True
    except (OSError, http.client.HTTPException):
        # URLError/HTTPError, timeouts and connection resets are all OSError
        # subclasses; HTTPException covers malformed/partial responses that
        # can occur while the forward is still coming up.
        return False


def start_tunnel() -> bool:
    """
    Make sure Ollama is reachable at OLLAMA_BASE.

    On production this only logs the endpoint and returns True. Elsewhere it
    reuses an already-answering endpoint, or spawns an ``ssh -N -L`` port
    forward and probes it once per second for up to 15 seconds.

    Returns:
        True if the endpoint answered (or ON_PROD is set), False if the
        tunnel could not be opened. On failure the spawned ssh process is
        stopped so nothing is left running.
    """
    global _tunnel_proc

    if ON_PROD:
        log.info("Production: Ollama at %s", OLLAMA_BASE)
        return True

    # Check if tunnel already open
    if _ollama_reachable(timeout=3):
        log.info("Ollama tunnel already open")
        return True

    # A handle left over from an earlier call belongs to a tunnel that no
    # longer answers; reap it before replacing it.
    stop_tunnel()

    log.info("Opening SSH tunnel to Ollama...")
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification
    # and permits man-in-the-middle attacks. Kept for compatibility with
    # existing setups; consider pinning the host key in known_hosts.
    proc = subprocess.Popen(
        [
            "ssh", "-N",
            "-L", f"{TUNNEL_PORT}:{REMOTE_HOST}:{REMOTE_PORT}",
            "-p", SSH_PORT,
            "-i", SSH_KEY,
            "-o", "StrictHostKeyChecking=no",
            "-o", "ExitOnForwardFailure=yes",
            SSH_TARGET,
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    _tunnel_proc = proc

    for _ in range(15):
        time.sleep(1)
        if _ollama_reachable(timeout=2):
            log.info("Tunnel ready")
            return True
        exit_code = proc.poll()
        if exit_code is not None:
            # ssh already exited, so further probing cannot succeed.
            log.warning("ssh exited early with code %s", exit_code)
            break

    log.error("Failed to open tunnel")
    # Do not leave a half-open ssh process behind.
    stop_tunnel()
    return False


def stop_tunnel() -> None:
    """
    Stop the ssh process started by start_tunnel(), if there is one.

    The process is terminated and then waited on so it does not linger as a
    zombie; if it has not exited after 5 seconds it is killed. Calling this
    when no tunnel is tracked is a no-op.
    """
    global _tunnel_proc

    proc, _tunnel_proc = _tunnel_proc, None
    if proc is None:
        return

    if proc.poll() is None:
        proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


# ---------------------------------------------------------------------------
# Model warmup
# ---------------------------------------------------------------------------

def warmup():
    """Pre-load the model into GPU/RAM so first real call is fast."""
    global _warmed_up

    if _warmed_up:
        return

    log.info("Warming up %s...", MODEL)
    try:
        generate("Say OK", system="You are a test.", max_tokens=5)
    except Exception as e:
        # Best effort by design: a failed warmup must not stop the caller,
        # and _warmed_up stays False so a later call tries again.
        log.warning("Warmup failed: %s", e)
    else:
        _warmed_up = True
        log.info("Model loaded and warm")


# ---------------------------------------------------------------------------
# Generate
# ---------------------------------------------------------------------------

def generate(prompt: str, system: str = "", max_tokens: int = 200,
             temperature: float = 0.6) -> str:
    """
    Generate a response from Ollama.
    Keeps the model loaded between calls via keep_alive parameter.

    Args:
        prompt: Text sent as the "prompt" field.
        system: Text sent as the "system" field.
        max_tokens: Sent as options.num_predict.
        temperature: Sent as options.temperature.

    Returns:
        The "response" field of the JSON reply with surrounding whitespace
        stripped, or "" when the field is missing or null.

    Raises:
        urllib.error.URLError: If the request fails or the server answers
            with an HTTP error status.
        json.JSONDecodeError: If the reply body is not valid JSON.
    """
    payload = json.dumps({
        "model": MODEL,
        "system": system,
        "prompt": prompt,
        "stream": False,
        "keep_alive": KEEP_ALIVE,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }).encode()

    req = urllib.request.Request(
        f"{OLLAMA_BASE}/api/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
        data = json.loads(r.read())

    # "or" guards against an explicit null, which .get()'s default does not.
    return (data.get("response") or "").strip()