add balance/mean-reversion signal and Cloudflare visitor logging

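The balance signal (15% weight) favors chairs that are under-represented in the last 50 games; base_rate, markov_1 and recent_20 each give up 0.05 of weight to make room for it. The visitor middleware captures real client IPs from Cloudflare (CF) headers, and the records are batched into a ClickHouse table with a 90-day TTL.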
2026-02-26 09:59:27 +05:00
parent 5fd4894599
commit 86865166ef
4 changed files with 135 additions and 9 deletions
--- a/app/db.py
+++ b/app/db.py
@@ -61,6 +61,15 @@ def run_migrations():
        )
        client.insert("_migrations", [["swap_ac_chairs"]], column_names=["name"])
        log.info("Migration swap_ac_chairs applied")
+    # Ensure visitors table exists (for existing deployments)
+    client.command(
+        "CREATE TABLE IF NOT EXISTS visitors ("
+        "  ip String, country String, path String, method String,"
+        "  user_agent String, referer String, accept_lang String,"
+        "  created_at DateTime DEFAULT now()"
+        ") ENGINE = MergeTree() ORDER BY (created_at, ip)"
+        " TTL created_at + INTERVAL 90 DAY"
+    )
    _migrations_applied = True


@@ -143,6 +152,31 @@ def upsert_user(user: dict):
    )


+@_with_lock
+def insert_visitors(batch: list[dict]):
+    """Write a batch of visitor records to the ``visitors`` table.
+
+    Each record is a dict; any of the seven column keys that is missing
+    is stored as an empty string. An empty batch returns before a client
+    is requested. ``created_at`` is omitted and left to the table default.
+    """
+    if not batch:
+        return
+    client = get_client()
+    # One list drives both row layout and column_names so they stay in sync.
+    columns = [
+        "ip",
+        "country",
+        "path",
+        "method",
+        "user_agent",
+        "referer",
+        "accept_lang",
+    ]
+    records = [[entry.get(col, "") for col in columns] for entry in batch]
+    client.insert("visitors", records, column_names=columns)
+
+
@_with_lock
 def get_recent_games(n: int = 50) -> list[dict]:
    """Get last N completed games."""
@@ -1000,8 +1034,17 @@ def _bayesian_prediction(winners, markov1, markov2):
    else:
        streak = {c: 1 / 3 for c in CHAIR_LABELS}

-    weights = {"base_rate": 0.20, "markov_1": 0.30, "markov_2": 0.25, "recent_20": 0.15, "streak": 0.10}
-    signals = {"base_rate": base, "markov_1": m1, "markov_2": m2, "recent_20": rec, "streak": streak}
+    # Signal 6: Balance / Mean Reversion — 15%
+    # Look at last 50 games, invert frequencies to favor under-represented chairs
+    window = min(50, len(winners))
+    recent_50 = winners[-window:]
+    freq = {c: recent_50.count(c) / window for c in CHAIR_LABELS}
+    balance = {c: max(0.01, 2 / 3 - freq[c]) for c in CHAIR_LABELS}
+    bal_total = sum(balance.values())
+    balance = {c: balance[c] / bal_total for c in CHAIR_LABELS}
+
+    weights = {"base_rate": 0.15, "markov_1": 0.25, "markov_2": 0.25, "recent_20": 0.10, "streak": 0.10, "balance": 0.15}
+    signals = {"base_rate": base, "markov_1": m1, "markov_2": m2, "recent_20": rec, "streak": streak, "balance": balance}

    combined = {c: 0 for c in CHAIR_LABELS}
    for sig_name, weight in weights.items():
@@ -1122,7 +1165,7 @@ def _backtest_theories(winners):
    if len(winners) <= warmup:
        return {"error": "Not enough data for backtesting"}

-    theories = ["base_rate", "markov_1", "markov_2", "recent_20", "streak", "combined"]
+    theories = ["base_rate", "markov_1", "markov_2", "recent_20", "streak", "balance", "combined"]
    full_hits = {t: 0 for t in theories}
    semi_hits = {t: 0 for t in theories}
    total_tested = 0
@@ -1176,10 +1219,19 @@ def _backtest_theories(winners):
            streak_probs = {c: 1 / 3 for c in CHAIR_LABELS}
        streak_ranked = sorted(CHAIR_LABELS, key=lambda c: streak_probs[c], reverse=True)

+        # Balance / Mean Reversion
+        bal_window = min(50, len(history))
+        bal_recent = history[-bal_window:]
+        bal_freq = {c: bal_recent.count(c) / bal_window for c in CHAIR_LABELS}
+        bal_probs = {c: max(0.01, 2 / 3 - bal_freq[c]) for c in CHAIR_LABELS}
+        bal_t = sum(bal_probs.values())
+        bal_probs = {c: bal_probs[c] / bal_t for c in CHAIR_LABELS}
+        bal_ranked = sorted(CHAIR_LABELS, key=lambda c: bal_probs[c], reverse=True)
+
        # Combined Bayesian
        combined = {c: 0 for c in CHAIR_LABELS}
-        weights = {"base_rate": 0.20, "markov_1": 0.30, "markov_2": 0.25, "recent_20": 0.15, "streak": 0.10}
-        signals = {"base_rate": base, "markov_1": m1_probs, "markov_2": m2_probs, "recent_20": rec, "streak": streak_probs}
+        weights = {"base_rate": 0.15, "markov_1": 0.25, "markov_2": 0.25, "recent_20": 0.10, "streak": 0.10, "balance": 0.15}
+        signals = {"base_rate": base, "markov_1": m1_probs, "markov_2": m2_probs, "recent_20": rec, "streak": streak_probs, "balance": bal_probs}
        for sig_name, weight in weights.items():
            for c in CHAIR_LABELS:
                combined[c] += weight * signals[sig_name].get(c, 1 / 3)
@@ -1187,7 +1239,8 @@ def _backtest_theories(winners):

        ranked = {
            "base_rate": base_ranked, "markov_1": m1_ranked, "markov_2": m2_ranked,
-            "recent_20": rec_ranked, "streak": streak_ranked, "combined": combined_ranked,
+            "recent_20": rec_ranked, "streak": streak_ranked, "balance": bal_ranked,
+            "combined": combined_ranked,
        }
        for t in theories:
            pick = ranked[t][0]