Modeling Kelp Observations as a Zero-Inflated Poisson Process

13 May 2022 - Dean Goldman

Introduction

An important part of wildlife monitoring is assessing whether or not a species population has increased or decreased over time. In this post, I share a project that applies statistical modeling to kelp observational data from KEEN (Kelp Ecosystem Ecology Network), with the goal of determining whether there has been a statistically significant change over time in the rate of kelp observations. My goal for this post is to share a useful application of statistical modeling to wildlife data, and to break down some of the methods I used along with some code I put together.

Kelp Observation Dataset

In the KEEN dataset, divers collected marine life sightings from 13 kelp forest sites in regions around the Gulf of Maine between 2014 and 2018. The KEEN dataset captures a variety of wildlife including kelp, fish, algae, and invertebrates. This research will focus solely on the number of kelp sightings per “quadrat”. A quadrat refers to a one square meter sample within a 40 meter transect of marine area at a general kelp forest site. Each transect contains six quadrats spaced eight meters apart, alternating on either side of the transect line.

[Figure: quadrat sampling protocol]
[Figure: kelp forest sites around the Gulf of Maine]

Modeling Kelp Populations as a Zero-Inflated Poisson Process

To study the behavior of a population of kelp over time, it is useful to describe the observed data with a theoretical probability distribution. From this model, whose parameters are estimated from our data, we can draw probabilistic conclusions about the population’s past behavior and form expectations about its future behavior. This particular analysis attempts to express the probability of a given number of kelp observations occurring in a quadrat. The distribution for this kind of question is the Poisson distribution, which more generally expresses the probability of a given number of independent events occurring in a fixed interval. You can read more about the Poisson distribution in my other blogpost: https://deanjgoldman.github.io/2022/04/15/Poisson-Distribution-Binomial.html.

One important thing to be aware of when using a Poisson distribution is sparsity: if the data contain a lot of 0 counts, a useful modification of the Poisson is the Zero-Inflated Poisson (ZIP) [3][4]. In most of these kelp sites, there are far more 0 counts than a Poisson would predict, even with the leanest rate parameter \(\lambda=1\). A ZIP distribution inflates the expected number of zeros, so it seems like a pretty good choice.
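As a quick illustration of what zero inflation buys us, here is a small sketch comparing the probability of an empty quadrat under a plain Poisson with \(\lambda=1\) against a ZIP with the same rate. The structural-zero probability of 0.4 is made up for the example:

```python
import math

lam = 1.0  # the "leanest" Poisson rate mentioned above

# Probability of observing zero events under a plain Poisson(lam)
p_zero_poisson = math.exp(-lam)

# Under a ZIP, zeros come from two sources: a structural-zero component
# (with hypothetical probability 0.4 here) and the Poisson component.
p_struct = 0.4
p_zero_zip = p_struct + (1 - p_struct) * math.exp(-lam)

print(f"P(0) under Poisson(1):   {p_zero_poisson:.3f}")  # 0.368
print(f"P(0) under ZIP(0.4, 1):  {p_zero_zip:.3f}")      # 0.621
```

Even with the same rate, the ZIP puts substantially more mass on zero, which is the behavior we see in the site histograms.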

[Figure: kelp observation frequencies per site]


[Figure: Poisson distribution]
[Figure: ZIP example]

Exploratory Analysis

A good starting point for analysis is to compute the empirical statistics of kelp observations per site, e.g. sample mean, variance, etc.
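A minimal sketch of computing those statistics with the standard library. The site names match the dataset, but the per-quadrat counts here are invented for illustration:

```python
import statistics

# Hypothetical per-quadrat kelp counts grouped by site; in the real
# analysis these would come from the KEEN data files.
counts_by_site = {
    "Baker North": [0, 0, 3, 0, 12, 1, 0, 5],
    "Nahant":      [2, 0, 0, 7, 0, 1, 0, 0],
}

for site, counts in counts_by_site.items():
    mean = statistics.mean(counts)
    var = statistics.variance(counts)  # sample variance (n - 1 denominator)
    sd = statistics.stdev(counts)
    print(f"{site}: mean={mean:.2f} var={var:.2f} sd={sd:.2f}")
```

Even on toy numbers like these, the variance far exceeds the mean, which is the overdispersion pattern discussed below.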


From the sample statistics, we can see that a key assumption of the Poisson (the average rate \(\lambda\) equals the variance) is violated. In this case, \(\lambda\) is much closer to the standard deviation than to the variance. This is explained by the fact that most sites are zero-inflated and tend to have a good number of outlier counts, meaning that some quadrats contained a relatively high number of kelp observations. We can still move forward with the Poisson analysis, but this is an important observation to flag, and it probably deserves its own accounting in future modeling.

If we move forward with our analysis, we do see that the average rate of kelp observations tends to go down year over year in several sites (e.g. Baker North, Baker South, and Nahant). This is a good starting point for a hypothesis test to determine whether or not this apparent decline is a statistically significant decay in the rate of the population's Poisson process.

Parameterizing the Model

The basic Zero-Inflated Poisson distribution requires two parameters: \(p\) and \(\lambda\).

\[\begin{aligned} &P(Y=0) = p + (1 - p)e^{-\lambda}\\ &P(Y=k) = (1 - p)\frac{\lambda^{k}e^{-\lambda}}{k!} \end{aligned} \tag{1}\]

The ZIP distribution’s \(p\) parameter is the probability in a Bernoulli distribution that determines whether an interval contains 0 events or a number of events drawn from a Poisson with parameter \(\lambda\); this is what “inflates” the frequency of 0 counts in a ZIP distribution. \(\lambda\) is the expected rate of occurrence in the Poisson process, in our case the expected number of kelp in a quadrat. A common way to determine these values is to find the parameters under which the probability of the observed data is higher than under all other values. This is called Maximum Likelihood Estimation (MLE), a general method for finding the parameters of a model. For a very simple Poisson model, finding \(\lambda\) through MLE is equivalent to taking the sample mean [2], but the ZIP distribution deviates slightly from this.
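Here is a rough sketch of fitting the two ZIP parameters by MLE on synthetic data. The grid search is deliberately crude (a real analysis would use an EM algorithm or a numerical optimizer), but it makes the idea concrete: pick the \((p, \lambda)\) pair that makes the observed counts most probable.

```python
import math

def zip_pmf(k, p, lam):
    """ZIP PMF from equation (1): a structural zero with probability p,
    otherwise a Poisson(lam) draw."""
    pois = math.exp(-lam) * lam ** k / math.factorial(k)
    return p + (1 - p) * pois if k == 0 else (1 - p) * pois

def zip_log_likelihood(data, p, lam):
    return sum(math.log(zip_pmf(k, p, lam)) for k in data)

def fit_zip_grid(data):
    """Crude MLE by grid search. p stops short of 1 and lam stays above 0
    so the log-likelihood never hits log(0)."""
    p_grid = [i / 50 for i in range(50)]        # 0.00 .. 0.98
    lam_grid = [i / 10 for i in range(5, 101)]  # 0.5 .. 10.0
    return max(
        ((p, lam) for p in p_grid for lam in lam_grid),
        key=lambda t: zip_log_likelihood(data, *t),
    )

# Synthetic counts: many structural zeros plus a Poisson-ish bump near 4
data = [0] * 30 + [3, 4, 4, 5, 3, 4, 6, 2, 4, 5]
p_hat, lam_hat = fit_zip_grid(data)
print(p_hat, lam_hat)  # roughly p ≈ 0.74, lambda ≈ 3.9 on this data
```

Note that the fitted \(\lambda\) is close to the mean of the nonzero counts, not the overall sample mean, which is the sense in which ZIP fitting "slightly deviates" from plain Poisson MLE.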

Determining Goodness-of-Fit Between the Theoretical Model and Observed Data

Although it seems plausible that kelp observations per quadrat meet the criteria of a ZIP process, we can’t assume the probability distribution of the number of kelp per quadrat neatly fits that model. It may be the case that the events (kelp plant sightings) aren’t independent: the existence of one or more kelp plants may increase or decrease the probability of others nearby. A critical step in this analysis is the Chi-Square goodness-of-fit test, which compares the observed distribution (the kelp counts) with the expected distribution (the Zero-Inflated Poisson). The Chi-Square test for goodness of fit is performed by setting up a hypothesis test:

\[\begin{aligned} &H_{0}: X \sim \text{ZIP}(p, \lambda)\\ &vs.\\ &H_{A}: X \not\sim \text{ZIP}(p, \lambda)\\ &\\ &\text{Reject } H_{0} \text{ if } \chi^{2} > \chi^{2}{}_{1-\alpha,k-c} \end{aligned}\]

\(\chi^{2}\) is the normalized sum of squared differences between the observed frequencies of each number of events and their expected frequencies. That is, a single value in \(O\) is the number of times the dataset shows \(x\) events occurring in an interval, and the corresponding value in \(E\) is the expected frequency of \(x\) events per interval, assuming the null hypothesis is true. When the data fit the distribution well, i.e. when the differences between \(O\) and \(E\) are small, \(\chi^{2}\) will be relatively small.

\[\chi^{2} = \Sigma\frac{(O-E)^{2}}{E} \tag{2}\]

We’re interested in comparing \(\chi^{2}\) with a theoretical number called the critical value, given by \(\chi^{2}{}_{1 - \alpha,k - c}\). The critical value is derived from the Chi-Square distribution, the distribution of the sum of the squares of independent standard normal variables. In the goodness-of-fit setting, the degrees of freedom are \(k - c\), where \(k\) is the number of count categories (bins) and \(c\) is the number of parameters estimated from the data plus one; since the ZIP has two estimated parameters (\(p\) and \(\lambda\)), \(c = 3\) in our case.


The critical value is also the threshold at which, given that our \(\chi^{2}\) exceeds it, we are \((1-\alpha) \cdot 100\%\) confident that we are correctly rejecting our null hypothesis. Another way of running this hypothesis test is to compare the \(p\)-value derived from \(\chi^{2}\) with our significance level \(\alpha\). The \(p\)-value is the probability of observing a test statistic at least as large as \(\chi^{2}\) given that the null hypothesis is true [2]. If \(p < \alpha\), that probability is low enough that we cannot attribute the deviation to chance, and we may reject the null hypothesis. If the \(p\)-value is higher than \(\alpha\), the deviation between observed and expected is plausibly within the range of ordinary sampling error, so we fail to reject the null hypothesis. At a high level, this is a bit like comparing two histograms: the histogram of observed kelp sightings per quadrat, and a theoretical ZIP pmf.

Implementation

I wrote out some Python code for running the Chi-Square goodness-of-fit test for a ZIP distribution.
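A minimal, self-contained sketch of such a test. The binning strategy (one bin per count value, with the upper tail folded into the top bin) and the hard-coded \(\alpha = 0.05\) critical values are illustrative choices, and the counts below are invented:

```python
import math

def zip_pmf(k, p, lam):
    """ZIP PMF from equation (1)."""
    pois = math.exp(-lam) * lam ** k / math.factorial(k)
    return p + (1 - p) * pois if k == 0 else (1 - p) * pois

def chi_square_gof(counts, p, lam):
    """Chi-square goodness-of-fit of per-quadrat counts against ZIP(p, lam).

    Bins are the count values 0..max(counts); the expected frequency of
    the top bin absorbs the upper tail, so expected frequencies sum to n."""
    n = len(counts)
    k_max = max(counts)
    observed = [counts.count(k) for k in range(k_max + 1)]
    expected = [n * zip_pmf(k, p, lam) for k in range(k_max)]
    expected.append(n * (1 - sum(zip_pmf(k, p, lam) for k in range(k_max))))
    chi2 = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
    # degrees of freedom: bins minus estimated parameters (p, lam) minus 1
    df = max((k_max + 1) - 2 - 1, 1)
    # upper-tail critical values at alpha = 0.05; a real implementation
    # would compute this from the chi-square survival function
    critical = {1: 3.841, 2: 5.991, 3: 7.815, 4: 9.488, 5: 11.070}
    reject = chi2 > critical[min(df, 5)]
    return chi2, df, reject

counts = [0] * 30 + [1, 2, 2, 3, 3, 3, 4, 4, 5, 4]
chi2, df, reject = chi_square_gof(counts, p=0.72, lam=3.2)
print(chi2, df, reject)
```

A common practical refinement is to merge bins whose expected frequency falls below about 5, which this sketch skips for brevity.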


From the histograms, the observed frequencies are pretty close to the expected frequencies, and in a good number of cases we fail to reject the null hypothesis that the counts follow a ZIP distribution. For the purposes of moving forward, let’s call this satisfactory. Note: the rest of this post performs analysis on all of the sites, not just the sites with a satisfactory goodness-of-fit, mostly because this is just a research endeavor.

Exact Hypothesis Test (E-Test)

We’ll now compare the earliest and latest recorded years of observed data for each kelp forest site. If we can determine a statistically significant difference between the two distributions, we can conclude that there has been a change in the rate of kelp observations between the earliest and latest years at a site. This can have important implications for decision making in wildlife monitoring: if we’re highly confident there has been a decrease in kelp, that may indicate a priority for wildlife management organizations. The Exact Hypothesis Test (E-test) is useful for this purpose. In most implementations on the internet, a Poisson E-test compares the two populations’ Poisson rate parameters. In the code below, I reference the source code for the Poisson E-test in the Python statsmodels library, and hack in some functionality for the zero-inflated PMF.
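The statsmodels modification isn't reproduced here, but the core idea of comparing two Poisson rates can be sketched with a simpler normal (score) approximation, which is what the exact E-test refines by summing exact Poisson probabilities. The per-quadrat counts below are invented:

```python
import math

def poisson_rate_z_test(counts_a, counts_b):
    """Two-sample comparison of Poisson rates via a normal (score)
    approximation, a simpler stand-in for the exact E-test.
    counts_a and counts_b are per-quadrat counts for the two years."""
    x_a, n_a = sum(counts_a), len(counts_a)
    x_b, n_b = sum(counts_b), len(counts_b)
    # pooled rate estimate under H0: the two rates are equal
    lam_pooled = (x_a + x_b) / (n_a + n_b)
    se = math.sqrt(lam_pooled * (1 / n_a + 1 / n_b))
    z = (x_a / n_a - x_b / n_b) / se
    # two-sided p-value from the standard normal CDF
    p_value = 2 * (1 - 0.5 * (1 + math.erf(abs(z) / math.sqrt(2))))
    return z, p_value

early = [4, 6, 3, 5, 7, 4]  # hypothetical earliest-year quadrat counts
late = [1, 0, 2, 0, 1, 1]   # hypothetical latest-year quadrat counts
z, p = poisson_rate_z_test(early, late)
print(z, p)
```

With a drop this large the approximation rejects easily; near the boundary, the exact E-test is the safer choice, which is why the analysis uses it.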


Observations

Visually, we can see that in the plots where the null hypothesis is rejected, the orange histograms tend to have a more positive skew than the blue. This indicates that the concentration of frequencies around zero increased in the later year, which is consistent with the result of the E-test: the rate parameters fit to the later years are significantly smaller. For sites that reject the null hypothesis, there seems to be a good case that the site's kelp population has declined.

Discussion

One thing I noticed while running these experiments is that certain settings can produce widely different results, even flipping whether a hypothesis test passes or fails. I think the takeaway is that you need good reasons for your testing approach, and it may be useful to do some ablation studies or analysis within the experiment’s parameter bounds. The idea is that rather than having just a single test statistic, you have a range of statistics, and can analyze what happens as you traverse the experiment’s parameter space.

Another important thing to mention is that this analysis assumes independence of events, meaning the model assumes that one kelp observation is independent of the next. This is probably a little specious. There is certainly room for further work analyzing the competition between organisms, patterns of kelp reproduction, the makeup of the ocean floor, and whatever other forces encourage density or sparsity among kelp. That said, if the model calls all of that a wash and still has predictive power, it could still be a useful model.

For future experiments, I think this analysis would benefit from a modeling approach that deals with the large outlier frequencies. Why do some quadrats have a much higher number of kelp than the average? There is likely some explanation to be found by investigating the data collection methods, but an interesting approach could be a modification to the Poisson that allows for some of those high counts, maybe a “Tail-Inflated Poisson”?

Thanks for reading 🤙!

Acknowledgements

Gratitude for the people at KEEN who have started and maintained their project, and Austin Rochford for sharing an implementation of a ZIP Likelihood model.

References