new-site/scripts/reddit-monitor.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

551 lines
21 KiB
Python

#!/usr/bin/env python3
"""
reddit-monitor.py — Monitor Reddit for compliance-related questions relevant to
Performance West, generate helpful replies with Ollama (qwen2.5:3b), and post them.
Targets: r/smallbusiness, r/Entrepreneur, r/tax, r/legaladvice,
r/Bookkeeping, r/accounting, r/humanresources, r/QuickBooks, r/IRS,
r/ecommerce, r/marketing, r/realestateinvesting, r/restaurateur,
r/construction, r/antiwork, r/EmploymentLaw, r/freelance, r/startups,
r/payroll
State: ~/.reddit-monitor-state.json
Log: ~/logs/reddit-monitor.log
"""
import fcntl
import json
import logging
import os
import random
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
# Single-instance lock — acquired before anything else (including logging
# setup) so two overlapping cron invocations can't post duplicate replies.
# The handle is deliberately kept open (never closed) for the life of the
# process; the kernel releases the flock automatically on exit.
_LOCK_FILE = open("/tmp/reddit-monitor.lock", "w")
try:
    fcntl.flock(_LOCK_FILE, fcntl.LOCK_EX | fcntl.LOCK_NB)
except OSError:
    sys.exit(0)  # Another instance running — exit silently

# Make sibling modules in this scripts/ directory importable
sys.path.insert(0, os.path.dirname(__file__))
from alert import alert_account_broken
from product_facts import get_product_facts
import ollama_client
from gap_tracker import log_gap
from datetime import datetime, timezone
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Reddit script-app credentials — all read from the environment; missing
# client id/secret makes reddit_auth() log an error and return None.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "")
REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME", "")
REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD", "")
REDDIT_USER_AGENT = "PerfWestBot/1.0 (by /u/performancewest)"
STATE_FILE = Path.home() / ".reddit-monitor-state.json"  # dedupe + counter state
LOG_DIR = Path.home() / "logs"
LOG_DIR.mkdir(exist_ok=True)
# Rate limits — per run, per subreddit (per UTC day), and per UTC day overall
MAX_REPLIES_PER_RUN = 3
MAX_REPLIES_PER_SUBREDDIT = 1
PAUSE_BETWEEN_MIN = 5  # minutes
PAUSE_BETWEEN_MAX = 15  # minutes
DAILY_LIMIT = 10
MAX_AGE_DAYS = 7  # posts older than this are skipped
# ---------------------------------------------------------------------------
# Subreddits to monitor
# ---------------------------------------------------------------------------
# Subreddits to scan, grouped by expected signal quality.  List order is
# irrelevant at runtime — main() shuffles before scanning.
SUBREDDITS = [
    # TIER 1 — Highest volume, business owners asking compliance questions
    "smallbusiness",        # 470K — constant contractor/LLC/compliance posts
    "Entrepreneur",         # 470K — formation, contractor, privacy questions
    "tax",                  # 841K — 1099 vs W-2 daily, misclassification gold
    "legaladvice",          # 1.6M — employee-side misclassif posts (shows employer risk)
    # TIER 2 — Professionals who refer clients + direct compliance Q&A
    "Bookkeeping",          # 75K — 1099 processing, payroll compliance, QBO/Xero
    "accounting",           # 1.2M — broad but huge; contractor classification
    "humanresources",       # 107K — FLSA, handbooks, discrimination, HR policies
    "QuickBooks",           # 37K — payroll/1099 compliance in QB context
    "IRS",                  # 442K — enforcement notices, compliance questions
    # TIER 3 — Industry-specific (highest misclassification/wage-hour risk)
    "ecommerce",            # 91K — CCPA, privacy policies, SMS marketing
    "marketing",            # 141K — TCPA, SMS consent, DNC
    "realestateinvesting",  # contractor classification, entity formation
    "restaurateur",         # wage-hour violations (huge in food service)
    "construction",         # contractor misclassification (#1 violating industry)
    "antiwork",             # 1.6M — misclassif/wage theft posts get massive engagement
    "EmploymentLaw",        # 7.1K — small but 100% signal, every post is compliance
    "freelance",            # the "other side" of contractor misclassification
    "startups",             # 1.2M — business formation, early compliance
    "payroll",              # payroll tax compliance, misclassification
]
# ---------------------------------------------------------------------------
# Keyword triggers by compliance category
# ---------------------------------------------------------------------------
# Keyword triggers by compliance category.  matches_keywords() does a
# case-insensitive substring scan of title+body against each list; a post
# can match multiple categories.
COMPLIANCE_KEYWORDS = {
    "flsa": [
        "FLSA", "wage and hour", "overtime violation", "exempt vs nonexempt",
        "minimum wage", "off the clock", "meal break violation",
        "unpaid overtime", "salary threshold", "wage theft",
        "DOL audit", "Department of Labor",
    ],
    "misclassification": [
        "1099 vs W-2", "1099 vs W2", "independent contractor",
        "misclassification", "misclassified", "contractor or employee",
        "IC vs employee", "gig worker classification",
        "pay contractor", "paying 1099", "1099 worker",
        "contractor to employee", "should I 1099",
    ],
    "discrimination": [
        "workplace discrimination", "harassment policy", "Title VII",
        "ADA compliance", "hostile work environment", "DEI policy",
        "pay equity", "retaliation claim", "EEOC",
    ],
    "privacy": [
        "CCPA", "CPRA", "privacy policy", "data privacy", "opt-out request",
        "cookie consent", "data breach notification", "biometric data",
        "privacy compliance", "do not sell", "consumer rights request",
    ],
    "tcpa": [
        "TCPA", "robocall", "SMS marketing", "text message consent",
        "do not call", "DNC list", "autodialer",
        "prior express written consent", "one-to-one consent",
        "SMS campaign sued", "text marketing compliance",
    ],
    "corporate": [
        "LLC formation", "form an LLC", "register a business",
        "annual report filing", "registered agent", "foreign qualification",
        "state registration", "business formation", "incorporate",
        "S-Corp election", "C-Corp vs S-Corp", "EIN",
        "operating agreement", "good standing",
    ],
    "telecom": [
        "FCC 499A", "STIR/SHAKEN", "telecom compliance",
        "IPES registration", "ISP registration", "robocall attestation",
        "FCC registration", "CLEC", "telecom license",
    ],
    "payroll": [
        "payroll compliance", "payroll tax", "W-4", "Form 941",
        "employer taxes", "FUTA", "SUTA", "withholding",
        "QuickBooks payroll", "Xero payroll", "payroll setup",
    ],
}
# Flatten all keywords for quick scanning (order follows category order)
ALL_KEYWORDS = [kw for kws in COMPLIANCE_KEYWORDS.values() for kw in kws]
# ---------------------------------------------------------------------------
# System prompt
# ---------------------------------------------------------------------------
def build_system_prompt() -> str:
    """Assemble the Ollama system prompt: persona, the authoritative product
    facts from product_facts.get_product_facts(), the SKIP qualification
    rules, and the reply style rules.  The model signals a declined post by
    answering "SKIP: <reason>", which main() parses."""
    return f"""You are Justin, the owner of Performance West (https://performancewest.net),
a compliance consulting firm helping US small and mid-size businesses navigate
employment, privacy, TCPA, corporate, and telecom compliance.
=== PRODUCT FACTS (authoritative — use these exactly, never claim anything not listed) ===
{get_product_facts()}
=== END PRODUCT FACTS ===
You are replying to a Reddit post where someone has a compliance-related question.
QUALIFICATION RULES — if skipping, respond ONLY with "SKIP: <one sentence reason>".
Skip if ANY of these are true:
- The person needs legal advice or legal representation (we are consultants, not attorneys)
- The person mentions they already have an attorney handling this
- The person is involved in active litigation (plaintiff or defendant)
- The person is outside the US (we only serve US businesses)
- The person is at a large enterprise (500+ employees — not our market)
- The question is about tax preparation or CPA-level tax advice
- The question is clearly academic or a student assignment
- Performance West's services would not meaningfully help their specific problem
- The compliance area is NOT covered by our services
REPLY RULES (only if not SKIPped):
- Be genuinely helpful and educational — answer their question first
- Explain the compliance concept clearly in plain language
- Only mention Performance West if it's directly relevant to their situation
- If we have a free tool (FLSA calculator, privacy policy generator, contractor quiz),
mention it naturally — people love free resources
- Never provide legal advice or say "you should do X" — instead explain what the
regulations generally require and suggest they get professional guidance
- Keep it conversational and helpful, not salesy
- Stay under 250 words
- Sign off with a new line and "-- Justin"
- Do NOT use markdown headers or bullet lists — Reddit comments should feel natural"""
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Mirror all log output to stdout (visible to cron/systemd capture) and to
# a persistent file under ~/logs.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_DIR / "reddit-monitor.log"),
    ],
)
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# State
# ---------------------------------------------------------------------------
def load_state():
    """Load persisted monitor state from STATE_FILE.

    Returns a dict guaranteed to contain every key the rest of the script
    relies on ("seen_ids", "replied_ids", "daily_count", "daily_reset",
    "subreddit_counts"), even when the on-disk file is missing, corrupt,
    or was written by an older version lacking newer keys — the original
    returned the raw parsed JSON, so a partial file could later raise
    KeyError in main() (e.g. state["daily_count"]).
    """
    defaults = {
        "seen_ids": [],
        "replied_ids": [],
        "daily_count": 0,
        "daily_reset": "",
        "subreddit_counts": {},
    }
    if STATE_FILE.exists():
        try:
            loaded = json.loads(STATE_FILE.read_text())
            if isinstance(loaded, dict):
                defaults.update(loaded)
        except Exception:
            # Corrupt file — start fresh rather than crash the run
            log.warning("State file unreadable; starting with fresh state")
    return defaults
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON (overwrites)."""
    STATE_FILE.write_text(json.dumps(state, indent=2))
def reset_daily_if_needed(state):
    """Zero the per-day counters when the UTC date has rolled over.

    Mutates and returns *state*; a no-op when "daily_reset" already
    matches today's UTC date.
    """
    current_day = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if state.get("daily_reset") == current_day:
        return state
    state["daily_count"] = 0
    state["daily_reset"] = current_day
    state["subreddit_counts"] = {}
    return state
# ---------------------------------------------------------------------------
# Reddit OAuth
# ---------------------------------------------------------------------------
_access_token = None  # module-level OAuth token cache, shared across calls
_token_expiry = 0     # epoch seconds after which the cached token is stale


def reddit_auth():
    """Get Reddit OAuth2 access token using script-type app credentials.

    Uses the password grant (valid for Reddit "script" apps only) with
    HTTP Basic auth of client_id:client_secret.  Caches the token at
    module level and refreshes 60 seconds before the reported expiry.
    Returns the token string, or None when credentials are missing or
    the request fails.
    """
    global _access_token, _token_expiry
    # Fast path: reuse the cached token while it's still valid
    if _access_token and time.time() < _token_expiry:
        return _access_token
    if not REDDIT_CLIENT_ID or not REDDIT_CLIENT_SECRET:
        log.error("Reddit credentials not set in environment")
        return None
    data = urllib.parse.urlencode({
        "grant_type": "password",
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
    }).encode()
    # HTTP Basic auth with client_id:client_secret
    import base64
    credentials = base64.b64encode(f"{REDDIT_CLIENT_ID}:{REDDIT_CLIENT_SECRET}".encode()).decode()
    req = urllib.request.Request(
        "https://www.reddit.com/api/v1/access_token",
        data=data,
        headers={
            "Authorization": f"Basic {credentials}",
            "User-Agent": REDDIT_USER_AGENT,
            "Content-Type": "application/x-www-form-urlencoded",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            resp = json.loads(r.read())
            # Expire 60s early so a token never goes stale mid-request
            _access_token = resp.get("access_token")
            _token_expiry = time.time() + resp.get("expires_in", 3600) - 60
            return _access_token
    except Exception as e:
        log.error(f"Reddit auth failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Reddit API helpers
# ---------------------------------------------------------------------------
def reddit_get(path, params=None):
    """Authenticated GET against the Reddit OAuth API.

    Returns the parsed JSON response, or {} on auth failure, HTTP error,
    or any transport problem.  A 401/403 additionally raises an account
    alert via alert_account_broken().
    """
    token = reddit_auth()
    if not token:
        return {}
    query = urllib.parse.urlencode(params or {})
    url = f"https://oauth.reddit.com{path}" + (f"?{query}" if query else "")
    request = urllib.request.Request(
        url,
        headers={
            "Authorization": f"Bearer {token}",
            "User-Agent": REDDIT_USER_AGENT,
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as err:
        snippet = err.read().decode("utf-8", errors="replace")[:300]
        log.warning(f"Reddit GET {path}: {err.code} {snippet}")
        if err.code in (401, 403):
            alert_account_broken("reddit-monitor", "Reddit", f"HTTP {err.code}", snippet)
    except Exception as err:
        log.warning(f"Reddit GET {path}: {err}")
    return {}
def reddit_post_comment(thing_id, text):
    """Post a comment reply to a Reddit submission or comment.

    Returns the parsed JSON response on success, or a dict containing an
    "error" key (plus "body" detail when available) on failure.  Failure
    now includes API-level rejections (e.g. RATELIMIT, bad thing_id) that
    Reddit delivers with HTTP 200 and "success": false in its jquery-style
    response envelope — the original treated those as success, so main()
    would count a rejected comment as posted.
    """
    token = reddit_auth()
    if not token:
        return {}
    data = urllib.parse.urlencode({
        "thing_id": thing_id,
        "text": text,
    }).encode()
    req = urllib.request.Request(
        "https://oauth.reddit.com/api/comment",
        data=data,
        headers={
            "Authorization": f"Bearer {token}",
            "User-Agent": REDDIT_USER_AGENT,
            "Content-Type": "application/x-www-form-urlencoded",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as r:
            resp = json.loads(r.read())
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")[:500]
        log.warning(f"Reddit POST comment: {e.code} {body}")
        if e.code in (401, 403):
            alert_account_broken("reddit-monitor", "Reddit", f"HTTP {e.code} on comment", body)
        return {"error": e.code, "body": body}
    except Exception as e:
        log.warning(f"Reddit POST comment: {e}")
        return {"error": str(e)}
    # Surface HTTP-200 API rejections; absent/true "success" passes through
    # unchanged, so well-formed success responses behave exactly as before.
    if isinstance(resp, dict) and resp.get("success") is False:
        detail = json.dumps(resp.get("jquery", []))[:500]
        log.warning(f"Reddit POST comment rejected: {detail}")
        return {"error": "api_rejected", "body": detail}
    return resp
# ---------------------------------------------------------------------------
# Keyword matching
# ---------------------------------------------------------------------------
def matches_keywords(title: str, body: str) -> list[str]:
    """Return the compliance categories whose keyword lists hit the post.

    Case-insensitive substring scan of title+body; categories come back in
    COMPLIANCE_KEYWORDS declaration order, each at most once.
    """
    haystack = (title + " " + body).lower()
    return [
        category
        for category, keywords in COMPLIANCE_KEYWORDS.items()
        if any(kw.lower() in haystack for kw in keywords)
    ]
# ---------------------------------------------------------------------------
# Ollama
# ---------------------------------------------------------------------------
def generate_reply(title: str, body_text: str, categories: list[str], subreddit: str) -> str:
    """Ask the Ollama model for a Reddit reply (or a "SKIP: ..." verdict).

    Truncates the post body to 800 chars and passes the matched categories
    as context; the system prompt carries the qualification rules.
    """
    system = build_system_prompt()
    prompt = f"""Reddit post in r/{subreddit}:
Title: {title}
Body (first 800 chars): {body_text[:800]}
Matched compliance categories: {", ".join(categories)}
Write a helpful Reddit comment reply. Answer their question first, then mention Performance West only if directly relevant."""
    return ollama_client.generate(prompt, system=system, max_tokens=350)
# ---------------------------------------------------------------------------
# Post logger
# ---------------------------------------------------------------------------
def log_post(platform, target_url, target_title, content):
    """Append one JSON line describing a posted reply to ~/logs/posts.log."""
    record = json.dumps({
        "ts": datetime.now(timezone.utc).isoformat(),
        "platform": platform,
        "url": target_url,
        "title": target_title,
        "content": content,
    })
    with (LOG_DIR / "posts.log").open("a") as fh:
        fh.write(record + "\n")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run one monitoring pass.

    Scans shuffled subreddits for keyword-matched self-posts, asks Ollama
    for a reply (or SKIP verdict) per match, posts up to
    MAX_REPLIES_PER_RUN replies subject to the daily and per-subreddit
    caps, and persists dedupe/counter state.  Always tears down the
    Ollama tunnel on exit.
    """
    log.info("=== Reddit monitor starting ===")
    state = load_state()
    state = reset_daily_if_needed(state)
    if state["daily_count"] >= DAILY_LIMIT:
        log.info(f"Daily reply limit reached ({DAILY_LIMIT}). Skipping.")
        return
    # Verify Reddit auth works before spinning up the LLM tunnel
    token = reddit_auth()
    if not token:
        log.error("Cannot authenticate with Reddit. Aborting.")
        alert_account_broken("reddit-monitor", "Reddit", "OAuth authentication failed")
        return
    if not ollama_client.start_tunnel():
        log.error("Cannot reach Ollama. Aborting.")
        return
    ollama_client.warmup()
    try:
        seen_ids = set(state.get("seen_ids", []))
        replied_ids = set(state.get("replied_ids", []))
        subreddit_counts = state.get("subreddit_counts", {})
        replies_this_run = 0
        cutoff = time.time() - (MAX_AGE_DAYS * 86400)  # oldest acceptable created_utc
        # Shuffle subreddits to avoid always hitting the same ones first
        subs = list(SUBREDDITS)
        random.shuffle(subs)
        for subreddit in subs:
            if replies_this_run >= MAX_REPLIES_PER_RUN:
                break
            if state["daily_count"] >= DAILY_LIMIT:
                break
            # NOTE(review): subreddit_counts lives in persisted state and is
            # cleared by reset_daily_if_needed(), so this cap is per-day,
            # not per-run as the log message says.
            if subreddit_counts.get(subreddit, 0) >= MAX_REPLIES_PER_SUBREDDIT:
                log.info(f"r/{subreddit}: already replied this run, skipping")
                continue
            log.info(f"Scanning r/{subreddit}...")
            # Fetch new posts from the subreddit
            resp = reddit_get(f"/r/{subreddit}/new", {"limit": 25})
            posts = resp.get("data", {}).get("children", [])
            for post_wrapper in posts:
                if replies_this_run >= MAX_REPLIES_PER_RUN:
                    break
                if state["daily_count"] >= DAILY_LIMIT:
                    break
                post = post_wrapper.get("data", {})
                post_id = post.get("id", "")
                fullname = post.get("name", "")  # t3_xxxxx — needed for /api/comment
                title = post.get("title", "")
                body = post.get("selftext", "")
                created = post.get("created_utc", 0)
                permalink = post.get("permalink", "")
                post_url = f"https://reddit.com{permalink}" if permalink else ""
                num_comments = post.get("num_comments", 0)  # unused in this function
                # Skip if already processed
                if post_id in seen_ids or post_id in replied_ids:
                    continue
                # Skip if too old
                if created < cutoff:
                    seen_ids.add(post_id)
                    continue
                # Skip link-only posts (no selftext)
                if not body or len(body.strip()) < 30:
                    seen_ids.add(post_id)
                    continue
                # Check keyword match
                categories = matches_keywords(title, body)
                if not categories:
                    seen_ids.add(post_id)
                    continue
                seen_ids.add(post_id)
                log.info(f" [r/{subreddit}] '{title[:65]}' (id:{post_id}) categories:{categories}")
                # Generate reply with LLM
                try:
                    reply = generate_reply(title, body, categories, subreddit)
                except Exception as e:
                    log.warning(f" Ollama error: {e}")
                    continue
                # Model declined: empty output or an explicit "SKIP: reason"
                if not reply or reply.strip().upper().startswith("SKIP"):
                    skip_reason = reply.strip()[4:].strip(" :-") if reply and len(reply.strip()) > 4 else ""
                    log.info(f" -> Skipped: {skip_reason or 'not relevant'}")
                    if skip_reason:
                        log_gap("Reddit", post_url, title, body[:300], skip_reason)
                    continue
                # Post the reply
                log.info(f" -> Posting reply ({len(reply)} chars)...")
                result = reddit_post_comment(fullname, reply)
                # Any {"error": ...} result from reddit_post_comment counts
                # as a failed post; alert if the detail suggests the account
                # itself is broken rather than a transient failure.
                if result.get("error"):
                    err = str(result.get("error", ""))
                    detail = result.get("body", "")
                    log.warning(f" -> Failed to post: {err}")
                    if any(x in str(detail).lower() for x in [
                        "forbidden", "banned", "suspended", "rate limit",
                        "unauthorized", "invalid_grant",
                    ]):
                        alert_account_broken("reddit-monitor", "Reddit", err, detail)
                    continue
                # Success
                log.info(f" -> Posted reply to r/{subreddit}")
                log_post("Reddit", post_url, title, reply)
                replied_ids.add(post_id)
                replies_this_run += 1
                state["daily_count"] += 1
                subreddit_counts[subreddit] = subreddit_counts.get(subreddit, 0) + 1
                # Human-like pause between replies
                if replies_this_run < MAX_REPLIES_PER_RUN:
                    pause = random.uniform(
                        PAUSE_BETWEEN_MIN * 60,
                        PAUSE_BETWEEN_MAX * 60,
                    )
                    log.info(f" -> Pausing {pause/60:.1f} min before next reply...")
                    time.sleep(pause)
                break  # Move to next subreddit after replying (1 per sub)
            # Don't hammer Reddit API
            time.sleep(2)
        # Save state — trim seen_ids to prevent unbounded growth.
        # NOTE(review): set iteration order is arbitrary, so the [-5000:]
        # trim keeps an arbitrary 5000 ids, not the most recent — acceptable
        # for a bounded dedupe cache, but worth knowing.
        state["seen_ids"] = list(seen_ids)[-5000:]
        state["replied_ids"] = list(replied_ids)
        state["subreddit_counts"] = subreddit_counts
        save_state(state)
        log.info(f"=== Done. {replies_this_run} replies posted this run. ===")
    finally:
        # Always tear down the Ollama tunnel, even if the run crashed
        ollama_client.stop_tunnel()
# Entry point — guarded so importing this module (e.g. for its keyword
# tables) doesn't trigger a monitoring run.  Note the flock at the top of
# the file still executes at import time.
if __name__ == "__main__":
    main()