new-site/scripts/reddit-monitor.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

551 lines
21 KiB
Python

#!/usr/bin/env python3
"""
reddit-monitor.py — Monitor Reddit for compliance-related questions relevant to
Performance West, generate helpful replies with Ollama (qwen2.5:3b), and post them.
Targets: r/smallbusiness, r/Entrepreneur, r/tax, r/legaladvice,
r/Bookkeeping, r/accounting, r/humanresources, r/QuickBooks, r/IRS,
r/ecommerce, r/marketing, r/realestateinvesting, r/restaurateur,
r/construction, r/antiwork, r/EmploymentLaw, r/freelance, r/startups,
r/payroll
State: ~/.reddit-monitor-state.json
Log: ~/logs/reddit-monitor.log
"""
import fcntl
import json
import logging
import os
import random
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
# Single-instance lock — acquired before anything else (including logging
# setup) so two overlapping cron invocations can't post duplicate replies.
# The handle is deliberately kept open (never closed) for the life of the
# process; the kernel releases the flock automatically on exit.
_LOCK_FILE = open("/tmp/reddit-monitor.lock", "w")
try:
    fcntl.flock(_LOCK_FILE, fcntl.LOCK_EX | fcntl.LOCK_NB)
except OSError:
    sys.exit(0)  # Another instance running — exit silently

# Make sibling modules in this scripts/ directory importable
sys.path.insert(0, os.path.dirname(__file__))
from alert import alert_account_broken
from product_facts import get_product_facts
import ollama_client
from gap_tracker import log_gap
from datetime import datetime, timezone
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Reddit script-app credentials — all read from the environment; missing
# client id/secret makes reddit_auth() log an error and return None.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME", "")
REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD", "")
REDDIT_USER_AGENT = "PerfWestBot/1.0 (by /u/performancewest)"
STATE_FILE = Path.home() / ".reddit-monitor-state.json"  # dedupe + counter state
LOG_DIR = Path.home() / "logs"
LOG_DIR.mkdir(exist_ok=True)
# Rate limits — per run, per subreddit (per UTC day), and per UTC day overall
MAX_REPLIES_PER_RUN = 3
MAX_REPLIES_PER_SUBREDDIT = 1
PAUSE_BETWEEN_MIN = 5  # minutes
PAUSE_BETWEEN_MAX = 15  # minutes
DAILY_LIMIT = 10
MAX_AGE_DAYS = 7  # posts older than this are skipped
# ---------------------------------------------------------------------------
# Subreddits to monitor
# ---------------------------------------------------------------------------
# Subreddits to scan, grouped by expected signal quality.  List order is
# irrelevant at runtime — main() shuffles before scanning.
SUBREDDITS = [
    # TIER 1 — Highest volume, business owners asking compliance questions
    "smallbusiness",        # 470K — constant contractor/LLC/compliance posts
    "Entrepreneur",         # 470K — formation, contractor, privacy questions
    "tax",                  # 841K — 1099 vs W-2 daily, misclassification gold
    "legaladvice",          # 1.6M — employee-side misclassif posts (shows employer risk)
    # TIER 2 — Professionals who refer clients + direct compliance Q&A
    "Bookkeeping",          # 75K — 1099 processing, payroll compliance, QBO/Xero
    "accounting",           # 1.2M — broad but huge; contractor classification
    "humanresources",       # 107K — FLSA, handbooks, discrimination, HR policies
    "QuickBooks",           # 37K — payroll/1099 compliance in QB context
    "IRS",                  # 442K — enforcement notices, compliance questions
    # TIER 3 — Industry-specific (highest misclassification/wage-hour risk)
    "ecommerce",            # 91K — CCPA, privacy policies, SMS marketing
    "marketing",            # 141K — TCPA, SMS consent, DNC
    "realestateinvesting",  # contractor classification, entity formation
    "restaurateur",         # wage-hour violations (huge in food service)
    "construction",         # contractor misclassification (#1 violating industry)
    "antiwork",             # 1.6M — misclassif/wage theft posts get massive engagement
    "EmploymentLaw",        # 7.1K — small but 100% signal, every post is compliance
    "freelance",            # the "other side" of contractor misclassification
    "startups",             # 1.2M — business formation, early compliance
    "payroll",              # payroll tax compliance, misclassification
]
# ---------------------------------------------------------------------------
# Keyword triggers by compliance category
# ---------------------------------------------------------------------------
# Keyword triggers by compliance category.  matches_keywords() does a
# case-insensitive substring scan of title+body against each list; a post
# can match multiple categories.
COMPLIANCE_KEYWORDS = {
    "flsa": [
        "FLSA", "wage and hour", "overtime violation", "exempt vs nonexempt",
        "minimum wage", "off the clock", "meal break violation",
        "unpaid overtime", "salary threshold", "wage theft",
        "DOL audit", "Department of Labor",
    ],
    "misclassification": [
        "1099 vs W-2", "1099 vs W2", "independent contractor",
        "misclassification", "misclassified", "contractor or employee",
        "IC vs employee", "gig worker classification",
        "pay contractor", "paying 1099", "1099 worker",
        "contractor to employee", "should I 1099",
    ],
    "discrimination": [
        "workplace discrimination", "harassment policy", "Title VII",
        "ADA compliance", "hostile work environment", "DEI policy",
        "pay equity", "retaliation claim", "EEOC",
    ],
    "privacy": [
        "CCPA", "CPRA", "privacy policy", "data privacy", "opt-out request",
        "cookie consent", "data breach notification", "biometric data",
        "privacy compliance", "do not sell", "consumer rights request",
    ],
    "tcpa": [
        "TCPA", "robocall", "SMS marketing", "text message consent",
        "do not call", "DNC list", "autodialer",
        "prior express written consent", "one-to-one consent",
        "SMS campaign sued", "text marketing compliance",
    ],
    "corporate": [
        "LLC formation", "form an LLC", "register a business",
        "annual report filing", "registered agent", "foreign qualification",
        "state registration", "business formation", "incorporate",
        "S-Corp election", "C-Corp vs S-Corp", "EIN",
        "operating agreement", "good standing",
    ],
    "telecom": [
        "FCC 499A", "STIR/SHAKEN", "telecom compliance",
        "IPES registration", "ISP registration", "robocall attestation",
        "FCC registration", "CLEC", "telecom license",
    ],
    "payroll": [
        "payroll compliance", "payroll tax", "W-4", "Form 941",
        "employer taxes", "FUTA", "SUTA", "withholding",
        "QuickBooks payroll", "Xero payroll", "payroll setup",
    ],
}
# Flatten all keywords for quick scanning (order follows category order)
ALL_KEYWORDS = [kw for kws in COMPLIANCE_KEYWORDS.values() for kw in kws]
# ---------------------------------------------------------------------------
# System prompt
# ---------------------------------------------------------------------------
def build_system_prompt() -> str:
    """Assemble the Ollama system prompt: persona, the authoritative product
    facts from product_facts.get_product_facts(), the SKIP qualification
    rules, and the reply style rules.  The model signals a declined post by
    answering "SKIP: <reason>", which main() parses."""
    return f"""You are Justin, the owner of Performance West (https://performancewest.net),
a compliance consulting firm helping US small and mid-size businesses navigate
employment, privacy, TCPA, corporate, and telecom compliance.
=== PRODUCT FACTS (authoritative — use these exactly, never claim anything not listed) ===
{get_product_facts()}
=== END PRODUCT FACTS ===
You are replying to a Reddit post where someone has a compliance-related question.
QUALIFICATION RULES — if skipping, respond ONLY with "SKIP: <one sentence reason>".
Skip if ANY of these are true:
- The person needs legal advice or legal representation (we are consultants, not attorneys)
- The person mentions they already have an attorney handling this
- The person is involved in active litigation (plaintiff or defendant)
- The person is outside the US (we only serve US businesses)
- The person is at a large enterprise (500+ employees — not our market)
- The question is about tax preparation or CPA-level tax advice
- The question is clearly academic or a student assignment
- Performance West's services would not meaningfully help their specific problem
- The compliance area is NOT covered by our services
REPLY RULES (only if not SKIPped):
- Be genuinely helpful and educational — answer their question first
- Explain the compliance concept clearly in plain language
- Only mention Performance West if it's directly relevant to their situation
- If we have a free tool (FLSA calculator, privacy policy generator, contractor quiz),
mention it naturally — people love free resources
- Never provide legal advice or say "you should do X" — instead explain what the
regulations generally require and suggest they get professional guidance
- Keep it conversational and helpful, not salesy
- Stay under 250 words
- Sign off with a new line and "-- Justin"
- Do NOT use markdown headers or bullet lists — Reddit comments should feel natural"""
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Mirror all log output to stdout (visible to cron/systemd capture) and to
# a persistent file under ~/logs.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_DIR / "reddit-monitor.log"),
    ],
)
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# State
# ---------------------------------------------------------------------------
def load_state():
    """Load persisted monitor state from STATE_FILE.

    Returns a dict guaranteed to contain every key the rest of the script
    relies on ("seen_ids", "replied_ids", "daily_count", "daily_reset",
    "subreddit_counts"), even when the on-disk file is missing, corrupt,
    or was written by an older version lacking newer keys — the original
    returned the raw parsed JSON, so a partial file could later raise
    KeyError in main() (e.g. state["daily_count"]).
    """
    defaults = {
        "seen_ids": [],
        "replied_ids": [],
        "daily_count": 0,
        "daily_reset": "",
        "subreddit_counts": {},
    }
    if STATE_FILE.exists():
        try:
            loaded = json.loads(STATE_FILE.read_text())
            if isinstance(loaded, dict):
                defaults.update(loaded)
        except Exception:
            # Corrupt file — start fresh rather than crash the run
            log.warning("State file unreadable; starting with fresh state")
    return defaults
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON (overwrites)."""
    STATE_FILE.write_text(json.dumps(state, indent=2))
def reset_daily_if_needed(state):
    """Zero the per-day counters when the UTC date has rolled over.

    Mutates and returns *state*; a no-op when "daily_reset" already
    matches today's UTC date.
    """
    current_day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if state.get("daily_reset") == current_day:
        return state
    state["daily_count"] = 0
    state["daily_reset"] = current_day
    state["subreddit_counts"] = {}
    return state
# ---------------------------------------------------------------------------
# Reddit OAuth
# ---------------------------------------------------------------------------
_access_token = None  # module-level OAuth token cache, shared across calls
_token_expiry = 0     # epoch seconds after which the cached token is stale


def reddit_auth():
    """Get Reddit OAuth2 access token using script-type app credentials.

    Uses the password grant (valid for Reddit "script" apps only) with
    HTTP Basic auth of client_id:client_secret.  Caches the token at
    module level and refreshes 60 seconds before the reported expiry.
    Returns the token string, or None when credentials are missing or
    the request fails.
    """
    global _access_token, _token_expiry
    # Fast path: reuse the cached token while it's still valid
    if _access_token and time.time() < _token_expiry:
        return _access_token
    if not REDDIT_CLIENT_ID or not REDDIT_CLIENT_SECRET:
        log.error("Reddit credentials not set in environment")
        return None
    data = urllib.parse.urlencode({
        "grant_type": "password",
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
    }).encode()
    # HTTP Basic auth with client_id:client_secret
    import base64
    credentials = base64.b64encode(f"{REDDIT_CLIENT_ID}:{REDDIT_CLIENT_SECRET}".encode()).decode()
    req = urllib.request.Request(
        "https://www.reddit.com/api/v1/access_token",
        data=data,
        headers={
            "Authorization": f"Basic {credentials}",
            "User-Agent": REDDIT_USER_AGENT,
            "Content-Type": "application/x-www-form-urlencoded",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            resp = json.loads(r.read())
            # Expire 60s early so a token never goes stale mid-request
            _access_token = resp.get("access_token")
            _token_expiry = time.time() + resp.get("expires_in", 3600) - 60
            return _access_token
    except Exception as e:
        log.error(f"Reddit auth failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Reddit API helpers
# ---------------------------------------------------------------------------
def reddit_get(path, params=None):
    """Authenticated GET against the Reddit OAuth API.

    Returns the parsed JSON response, or {} on auth failure, HTTP error,
    or any transport problem.  A 401/403 additionally raises an account
    alert via alert_account_broken().
    """
    token = reddit_auth()
    if not token:
        return {}
    query = urllib.parse.urlencode(params or {})
    url = f"https://oauth.reddit.com{path}" + (f"?{query}" if query else "")
    request = urllib.request.Request(
        url,
        headers={
            "Authorization": f"Bearer {token}",
            "User-Agent": REDDIT_USER_AGENT,
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as err:
        snippet = err.read().decode("utf-8", errors="replace")[:300]
        log.warning(f"Reddit GET {path}: {err.code} {snippet}")
        if err.code in (401, 403):
            alert_account_broken("reddit-monitor", "Reddit", f"HTTP {err.code}", snippet)
    except Exception as err:
        log.warning(f"Reddit GET {path}: {err}")
    return {}
def reddit_post_comment(thing_id, text):
    """Post a comment reply to a Reddit submission or comment.

    Returns the parsed JSON response on success, or a dict containing an
    "error" key (plus "body" detail when available) on failure.  Failure
    now includes API-level rejections (e.g. RATELIMIT, bad thing_id) that
    Reddit delivers with HTTP 200 and "success": false in its jquery-style
    response envelope — the original treated those as success, so main()
    would count a rejected comment as posted.
    """
    token = reddit_auth()
    if not token:
        return {}
    data = urllib.parse.urlencode({
        "thing_id": thing_id,
        "text": text,
    }).encode()
    req = urllib.request.Request(
        "https://oauth.reddit.com/api/comment",
        data=data,
        headers={
            "Authorization": f"Bearer {token}",
            "User-Agent": REDDIT_USER_AGENT,
            "Content-Type": "application/x-www-form-urlencoded",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            resp = json.loads(r.read())
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")[:500]
        log.warning(f"Reddit POST comment: {e.code} {body}")
        if e.code in (401, 403):
            alert_account_broken("reddit-monitor", "Reddit", f"HTTP {e.code} on comment", body)
        return {"error": e.code, "body": body}
    except Exception as e:
        log.warning(f"Reddit POST comment: {e}")
        return {"error": str(e)}
    # Surface HTTP-200 API rejections; absent/true "success" passes through
    # unchanged, so well-formed success responses behave exactly as before.
    if isinstance(resp, dict) and resp.get("success") is False:
        detail = json.dumps(resp.get("jquery", []))[:500]
        log.warning(f"Reddit POST comment rejected: {detail}")
        return {"error": "api_rejected", "body": detail}
    return resp
# ---------------------------------------------------------------------------
# Keyword matching
# ---------------------------------------------------------------------------
def matches_keywords(title: str, body: str) -> list[str]:
    """Return the compliance categories whose keyword lists hit the post.

    Case-insensitive substring scan of title+body; categories come back in
    COMPLIANCE_KEYWORDS declaration order, each at most once.
    """
    haystack = (title + " " + body).lower()
    return [
        category
        for category, keywords in COMPLIANCE_KEYWORDS.items()
        if any(kw.lower() in haystack for kw in keywords)
    ]
# ---------------------------------------------------------------------------
# Ollama
# ---------------------------------------------------------------------------
def generate_reply(title: str, body_text: str, categories: list[str], subreddit: str) -> str:
    """Ask the Ollama model for a Reddit reply (or a "SKIP: ..." verdict).

    Truncates the post body to 800 chars and passes the matched categories
    as context; the system prompt carries the qualification rules.
    """
    system = build_system_prompt()
    prompt = f"""Reddit post in r/{subreddit}:
Title: {title}
Body (first 800 chars): {body_text[:800]}
Matched compliance categories: {", ".join(categories)}
Write a helpful Reddit comment reply. Answer their question first, then mention Performance West only if directly relevant."""
    return ollama_client.generate(prompt, system=system, max_tokens=350)
# ---------------------------------------------------------------------------
# Post logger
# ---------------------------------------------------------------------------
def log_post(platform, target_url, target_title, content):
    """Append one JSON line describing a posted reply to ~/logs/posts.log."""
    record = json.dumps({
        "ts": datetime.now(timezone.utc).isoformat(),
        "platform": platform,
        "url": target_url,
        "title": target_title,
        "content": content,
    })
    with (LOG_DIR / "posts.log").open("a") as fh:
        fh.write(record + "\n")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run one monitoring pass.

    Scans shuffled subreddits for keyword-matched self-posts, asks Ollama
    for a reply (or SKIP verdict) per match, posts up to
    MAX_REPLIES_PER_RUN replies subject to the daily and per-subreddit
    caps, and persists dedupe/counter state.  Always tears down the
    Ollama tunnel on exit.
    """
    log.info("=== Reddit monitor starting ===")
    state = load_state()
    state = reset_daily_if_needed(state)
    if state["daily_count"] >= DAILY_LIMIT:
        log.info(f"Daily reply limit reached ({DAILY_LIMIT}). Skipping.")
        return
    # Verify Reddit auth works before spinning up the LLM tunnel
    token = reddit_auth()
    if not token:
        log.error("Cannot authenticate with Reddit. Aborting.")
        alert_account_broken("reddit-monitor", "Reddit", "OAuth authentication failed")
        return
    if not ollama_client.start_tunnel():
        log.error("Cannot reach Ollama. Aborting.")
        return
    ollama_client.warmup()
    try:
        seen_ids = set(state.get("seen_ids", []))
        replied_ids = set(state.get("replied_ids", []))
        subreddit_counts = state.get("subreddit_counts", {})
        replies_this_run = 0
        cutoff = time.time() - (MAX_AGE_DAYS * 86400)  # oldest acceptable created_utc
        # Shuffle subreddits to avoid always hitting the same ones first
        subs = list(SUBREDDITS)
        random.shuffle(subs)
        for subreddit in subs:
            if replies_this_run >= MAX_REPLIES_PER_RUN:
                break
            if state["daily_count"] >= DAILY_LIMIT:
                break
            # NOTE(review): subreddit_counts lives in persisted state and is
            # cleared by reset_daily_if_needed(), so this cap is per-day,
            # not per-run as the log message says.
            if subreddit_counts.get(subreddit, 0) >= MAX_REPLIES_PER_SUBREDDIT:
                log.info(f"r/{subreddit}: already replied this run, skipping")
                continue
            log.info(f"Scanning r/{subreddit}...")
            # Fetch new posts from the subreddit
            resp = reddit_get(f"/r/{subreddit}/new", {"limit": 25})
            posts = resp.get("data", {}).get("children", [])
            for post_wrapper in posts:
                if replies_this_run >= MAX_REPLIES_PER_RUN:
                    break
                if state["daily_count"] >= DAILY_LIMIT:
                    break
                post = post_wrapper.get("data", {})
                post_id = post.get("id", "")
                fullname = post.get("name", "")  # t3_xxxxx — needed for /api/comment
                title = post.get("title", "")
                body = post.get("selftext", "")
                created = post.get("created_utc", 0)
                permalink = post.get("permalink", "")
                post_url = f"https://reddit.com{permalink}" if permalink else ""
                num_comments = post.get("num_comments", 0)  # unused in this function
                # Skip if already processed
                if post_id in seen_ids or post_id in replied_ids:
                    continue
                # Skip if too old
                if created < cutoff:
                    seen_ids.add(post_id)
                    continue
                # Skip link-only posts (no selftext)
                if not body or len(body.strip()) < 30:
                    seen_ids.add(post_id)
                    continue
                # Check keyword match
                categories = matches_keywords(title, body)
                if not categories:
                    seen_ids.add(post_id)
                    continue
                seen_ids.add(post_id)
                log.info(f" [r/{subreddit}] '{title[:65]}' (id:{post_id}) categories:{categories}")
                # Generate reply with LLM
                try:
                    reply = generate_reply(title, body, categories, subreddit)
                except Exception as e:
                    log.warning(f" Ollama error: {e}")
                    continue
                # Model declined: empty output or an explicit "SKIP: reason"
                if not reply or reply.strip().upper().startswith("SKIP"):
                    skip_reason = reply.strip()[4:].strip(" :-") if reply and len(reply.strip()) > 4 else ""
                    log.info(f" -> Skipped: {skip_reason or 'not relevant'}")
                    if skip_reason:
                        log_gap("Reddit", post_url, title, body[:300], skip_reason)
                    continue
                # Post the reply
                log.info(f" -> Posting reply ({len(reply)} chars)...")
                result = reddit_post_comment(fullname, reply)
                # Any {"error": ...} result from reddit_post_comment counts
                # as a failed post; alert if the detail suggests the account
                # itself is broken rather than a transient failure.
                if result.get("error"):
                    err = str(result.get("error", ""))
                    detail = result.get("body", "")
                    log.warning(f" -> Failed to post: {err}")
                    if any(x in str(detail).lower() for x in [
                        "forbidden", "banned", "suspended", "rate limit",
                        "unauthorized", "invalid_grant",
                    ]):
                        alert_account_broken("reddit-monitor", "Reddit", err, detail)
                    continue
                # Success
                log.info(f" -> Posted reply to r/{subreddit}")
                log_post("Reddit", post_url, title, reply)
                replied_ids.add(post_id)
                replies_this_run += 1
                state["daily_count"] += 1
                subreddit_counts[subreddit] = subreddit_counts.get(subreddit, 0) + 1
                # Human-like pause between replies
                if replies_this_run < MAX_REPLIES_PER_RUN:
                    pause = random.uniform(
                        PAUSE_BETWEEN_MIN * 60,
                        PAUSE_BETWEEN_MAX * 60,
                    )
                    log.info(f" -> Pausing {pause/60:.1f} min before next reply...")
                    time.sleep(pause)
                break  # Move to next subreddit after replying (1 per sub)
            # Don't hammer Reddit API
            time.sleep(2)
        # Save state — trim seen_ids to prevent unbounded growth.
        # NOTE(review): set iteration order is arbitrary, so the [-5000:]
        # trim keeps an arbitrary 5000 ids, not the most recent — acceptable
        # for a bounded dedupe cache, but worth knowing.
        state["seen_ids"] = list(seen_ids)[-5000:]
        state["replied_ids"] = list(replied_ids)
        state["subreddit_counts"] = subreddit_counts
        save_state(state)
        log.info(f"=== Done. {replies_this_run} replies posted this run. ===")
    finally:
        # Always tear down the Ollama tunnel, even if the run crashed
        ollama_client.stop_tunnel()
# Entry point — guarded so importing this module (e.g. for its keyword
# tables) doesn't trigger a monitoring run.  Note the flock at the top of
# the file still executes at import time.
if __name__ == "__main__":
    main()