library(httr)
library(jsonlite)
library(maps)
library(spatstat.geom)
library(spatstat.explore)
library(splines)
library(mgcv)

# pull usgs earthquake data for california in january 2025
base <- "https://earthquake.usgs.gov/fdsnws/event/1/query"
q <- list(
  format = "geojson",
  starttime = "2026-01-01",
  endtime   = "2026-02-01",
  minlatitude = 32.4, maxlatitude = 42.1,
  minlongitude = -124.6, maxlongitude = -114.1,
  minmagnitude = 2.5,
  orderby = "time-asc",
  limit = 20000
)
res <- GET(base, query = q)
stop_for_status(res)
x <- content(res, as = "text", encoding = "UTF-8")
gj <- fromJSON(x, simplifyVector = FALSE)

# flatten features
feat <- gj$features
df <- data.frame(
  time = as.POSIXct(vapply(feat, \(f) f$properties$time, numeric(1)) / 1000,
                    origin = "1970-01-01", tz = "UTC"),
  mag  = vapply(feat, \(f) f$properties$mag,  numeric(1)),
  place= vapply(feat, \(f) f$properties$place, character(1)),
  lon  = vapply(feat, \(f) f$geometry$coordinates[[1]], numeric(1)),
  lat  = vapply(feat, \(f) f$geometry$coordinates[[2]], numeric(1)),
  depth_km = vapply(feat, \(f) f$geometry$coordinates[[3]], numeric(1))
)

# time range
range(df$time)

# magnitudes
hist(df$mag, breaks = 10, 
     main = 'magnitudes', xlab = 'Magnitude')

# event times
t <- df$time
m <- df$mag
o <- order(t)
t <- t[o]; m <- m[o]
plot(range(t), c(0, 1), type = "n",
     yaxt = "n", ylab = "", xlab = "Time",
     main = "event raster")
abline(v = t)

# high magnitude events
m_cut <- 4.0  
plot(range(t), c(0, 1), type = "n",
     yaxt = "n", ylab = "", xlab = "Time",
     main = "event raster")
abline(v = t, col = 'grey', lwd = 1)
abline(v = t[m >= m_cut], col = "red3", lwd = 1)

# spatial scatter
pad <- 2
xlim <- range(df$lon, na.rm = TRUE) + c(-pad, pad)
ylim <- range(df$lat, na.rm = TRUE) + c(-pad, pad)
map("state", fill = FALSE, col = "grey40", lwd = 1,
    xlim = xlim, ylim = ylim)
points(df$lon, df$lat, pch = 16,
       cex = pmax(0.4, (df$mag - min(df$mag, na.rm = TRUE) + 0.5) / 2))

# highlight high magnitude
points(df$lon[df$mag >= m_cut], df$lat[df$mag >= m_cut],
       pch = 16, col = "red3",
       cex = 1.2)

## CHECK FOR SPATIAL CLUSTERING ------------------------------------------------

# event locations
x <- df$lon
y <- df$lat

# observation window + point pattern
W <- owin(xrange = range(x, na.rm = TRUE), yrange = range(y, na.rm = TRUE))
X <- ppp(x, y, window = W)

# nearest-neighbor distances
nn <- nndist(X)

# book does this: CI for mean NN distance under CSR
n <- npoints(X)
lambda_hat <- intensity(X)
mu_hat  <- 1 / (2 * sqrt(lambda_hat))
var_hat <- (4 - pi) / (4 * pi * lambda_hat)
se_hat  <- sqrt(var_hat / n)
ci <- mu_hat + c(-1, 1) * qnorm(0.975) * se_hat

# book does this: compare mean under CSR with empirical mean NN distance
c(ci = ci, empirical = mean(nn))

# alternative: Q-Q plot vs CSR 
p <- ppoints(length(nn))
q_theory <- sqrt(-log(1 - p) / (lambda_hat * pi))
qqplot(q_theory, sort(nn),
       xlab = "Fitted", ylab = "Observed",
       main = "Q-Q plot: NN distances")
abline(0, 1, lty = 2)

# KS test for whether nn distances are weibull
F_csr <- function(r) 1 - exp(-lambda_hat * pi * r^2)
nn_j <- jitter(nn, amount = 1e-10)
ks.test(nn_j, F_csr)

# ... or use squared distances
dsq <- nn^2
rate <- lambda_hat * pi
qqplot(qexp(ppoints(length(dsq)), rate = rate), sort(dsq),
       xlab = "Fitted",
       ylab = "Observed",
       main = "Q-Q plot: squared NN distances")
abline(0, 1, lty = 2)

# KS test (jitter to avoid ties warning)
dsq_j <- jitter(dsq, amount = 1e-10)
ks.test(dsq_j, "pexp", rate = rate)

## CHECK FOR TEMPORAL CLUSTERING -----------------------------------------------

# count process
df <- df[order(df$time), ]
plot(df$time, seq_len(nrow(df)), type = "s",
     xlab = "Time", ylab = expression(N[t]))

# arrival and interarrival times
t <- sort(as.numeric(df$time))
dt <- diff(t)

# MLE for intensity
lambda_hat <- 1 / mean(dt)

# histogram + fitted exponential density
hist(dt, breaks = 50, freq = FALSE,
     main = "Interarrival times", xlab = "dt (seconds)")
curve(dexp(x, rate = lambda_hat), add = TRUE, lty = 2)

# Q-Q plot relative to exponential distribution
qqplot(qexp(ppoints(length(dt)), rate = lambda_hat), dt,
       xlab = "Fitted", ylab = "Observed",
       main = "Q-Q plot")
abline(0, 1, lty = 2)

# zoom in a bit
# under the y=x line indicates more short interarrival times than expected
qqplot(qexp(ppoints(length(dt)), rate=lambda_hat), dt,
       xlab="Fitted", ylab="Observed",
       main="Q-Q vs Exp (zoomed in)",
       xlim=c(0, quantile(qexp(ppoints(length(dt)), rate=lambda_hat), 0.75)),
       ylim=c(0, quantile(dt, 0.75)))
abline(0,1,lty=2)

qqplot(qexp(ppoints(length(dt)), rate=lambda_hat), dt, log="xy", 
       xlab="Fitted", ylab="Observed",
       main="Q-Q vs Exp (log-log)")
abline(0,1,lty=2)

# KS test for whether interarrival times are exponential
ks.test(dt, "pexp", rate = lambda_hat)

## FIT NONHOMOGENEOUS TEMPORAL POISSON MODEL -----------------------------------

# rescale (seconds -> days) and reindex
t <- sort(as.numeric(df$time))
t0 <- min(t)
t_days <- (t - t0) / 86400

# bin time
breaks <- seq(0, max(t_days), length.out = 201)
y <- hist(t_days, breaks = breaks, plot = FALSE)$counts
dt <- diff(breaks)
t_mid <- (breaks[-1] + breaks[-length(breaks)]) / 2

# data for modeling
dfb <- data.frame(y = y, t = t_mid, dt = dt)

# sorted event times in same units
t_event <- t_days

# ---- Poisson GLM with regression spline ----
fit <- glm(y ~ ns(t, df = 10),
           family = poisson(),
           offset = log(dfb$dt),
           data = dfb)

# intensity function
mu_hat <- predict(fit, type = "response")   # counts/bin
lambda_hat <- mu_hat / dfb$dt               # events/day
plot(t_mid, lambda_hat, type = "l", col = 'blue',
     xlab = "t (days)", ylab = expression(hat(lambda)(t)),
     main = "Intensity (GLM fit)")

# cumulative intensity
Lambda_hat <- c(0, cumsum(mu_hat))
N_obs <- findInterval(breaks, t_event)

plot(breaks, Lambda_hat, type = "l", col = 'blue',
     xlab = "t (days)", ylab = expression(hat(Lambda)(t)),
     main = "Cumulative intensity (GLM fit)")
lines(breaks, N_obs, type = "s", lwd = 1)

# ---- Poisson GAM ----
fit_gam <- gam(y ~ s(t, bs = "cr", k = 20) + offset(log(dt)),
               family = poisson(),
               method = "REML",
               data = dfb)

# intensity function
mu_hat_gam <- predict(fit_gam, type = "response")  # counts/bin
lambda_hat_gam <- mu_hat_gam / dfb$dt              # events/day
plot(t_mid, lambda_hat_gam, type = "l",
     xlab = "t (days)", ylab = expression(hat(lambda)(t)),
     main = "Intensity (GAM fit)")

# cumulative intensity
Lambda_hat_gam <- c(0, cumsum(mu_hat_gam))
plot(breaks, Lambda_hat_gam, type = "s", col = 'blue',
     xlab = "t (days)", ylab = expression(hat(Lambda)(t)),
     main = "Cumulative intensity (GAM fit)")
lines(breaks, N_obs, type = "s", lwd = 1)

## FIT DIAGNOSTICS FOR NHPP ----------------------------------------------------

# helper: piecewise-linear interpolation of cumulative intensity
# (to get estimated intensity at observed event times)
Lambda_hat_interp <- function(t) approx(breaks, Lambda_hat_gam, xout = t,
                                 rule = 2, ties = "ordered")$y

# rescaled interarrival times between fitted values
u <- diff(Lambda_hat_interp(t_event))

# under the poisson model, P(U < 0.1) = 1 - exp(-0.1)
pexp(0.1)

# compare
mean(u < 0.1)

# Q-Q vs Exp(1) (log-log): still many "bursts" (smaller-than-expected times)
qqplot(qexp(ppoints(length(u)), rate = 1), u, log = 'xy',
       xlab = "Expected", ylab = "Observed",
       main = "Q-Q: rescaled interarrivals (GAM fit)e")
abline(0, 1, lty = 2)

# ks test
ks.test(u, 'pexp', rate = 1)

## now what???