# Phil predictions: fit iid + 1st-order Markov, then streak + simulation summaries
# Run interactively; objects are created (no printing).
# CSV expected columns: year,prediction
# prediction values: "More winter", "Early spring", optionally "No prediction"

csv_path <- "_data/groundhog.csv"  # change if needed
set.seed(1)

# ----------------------------
# 0) Load + filter data
# ----------------------------
df_raw <- read.csv(csv_path, stringsAsFactors = FALSE)

df <- df_raw[df_raw$year >= 1900 & df_raw$prediction %in% c("More winter", "Early spring"), ]
df <- df[order(df$year), ]

state <- ifelse(df$prediction == "Early spring", "E", "M")  # E/M coding
Tlen <- length(state)
stopifnot(Tlen >= 2)

# ----------------------------
# 1) Model A: iid
# ----------------------------
pE_iid <- mean(state == "E")
pM_iid <- 1 - pE_iid

# ----------------------------
# 2) Model B: 1st-order Markov (MLE)
# ----------------------------
states <- c("E", "M")
idx <- match(state, states)

counts <- matrix(0L, nrow = 2, ncol = 2, dimnames = list(from = states, to = states))
for (t in 1:(Tlen - 1)) {
  counts[idx[t], idx[t + 1]] <- counts[idx[t], idx[t + 1]] + 1L
}
P_markov <- sweep(counts, 1, rowSums(counts), FUN = "/")  # transition matrix

# ============================================================
# 3) STREAKS + SIMULATIONS (under each model)
# ============================================================

# ---- helpers ----
longest_run_of <- function(x, value) {
  r <- rle(x)
  if (!any(r$values == value)) return(0L)
  max(r$lengths[r$values == value])
}

count_switches <- function(x) sum(x[-1] != x[-length(x)])

summarize_sims <- function(v, obs) {
  c(
    mean = mean(v),
    p05  = unname(quantile(v, 0.05)),
    p50  = unname(quantile(v, 0.50)),
    p95  = unname(quantile(v, 0.95)),
    p99  = unname(quantile(v, 0.99)),
    p_ge_obs = mean(v >= obs)
  )
}

simulate_markov <- function(Tlen, P, init) {
  x <- character(Tlen)
  x[1] <- init
  for (t in 2:Tlen) {
    if (x[t - 1] == "E") {
      x[t] <- sample(c("E", "M"), 1, prob = P["E", ])
    } else {
      x[t] <- sample(c("E", "M"), 1, prob = P["M", ])
    }
  }
  x
}

simulate_iid <- function(Tlen, pE) {
  sample(c("E", "M"), Tlen, replace = TRUE, prob = c(pE, 1 - pE))
}

# ---- observed stats ----
obs <- list(
  T = Tlen,
  nE = sum(state == "E"),
  nM = sum(state == "M"),
  switches = count_switches(state),
  longest_M = longest_run_of(state, "M"),
  longest_E = longest_run_of(state, "E")
)

# ---- simulations ----
N <- 20000
init_state <- state[1]

sim_markov <- list(
  longest_M = integer(N),
  switches  = integer(N),
  nE        = integer(N)
)

sim_iid <- list(
  longest_M = integer(N),
  switches  = integer(N),
  nE        = integer(N)
)

for (i in 1:N) {
  xm <- simulate_markov(Tlen, P_markov, init_state)
  sim_markov$longest_M[i] <- longest_run_of(xm, "M")
  sim_markov$switches[i]  <- count_switches(xm)
  sim_markov$nE[i]        <- sum(xm == "E")
  
  xi <- simulate_iid(Tlen, pE_iid)
  sim_iid$longest_M[i] <- longest_run_of(xi, "M")
  sim_iid$switches[i]  <- count_switches(xi)
  sim_iid$nE[i]        <- sum(xi == "E")
}

# ---- summaries (compare to observed) ----
summaries <- list(
  markov = list(
    longest_M = summarize_sims(sim_markov$longest_M, obs$longest_M),
    switches  = summarize_sims(sim_markov$switches,  obs$switches),
    nE        = summarize_sims(sim_markov$nE,        obs$nE)
  ),
  iid = list(
    longest_M = summarize_sims(sim_iid$longest_M, obs$longest_M),
    switches  = summarize_sims(sim_iid$switches,  obs$switches),
    nE        = summarize_sims(sim_iid$nE,        obs$nE)
  )
)

# Key objects you can inspect in the console:
# df_raw, df, state
# pE_iid, pM_iid
# counts, P_markov
# obs
# sim_markov, sim_iid
# summaries