# Import required packages
import numpy as np
import pandas as pd
import os
import altair as alt
from vega_datasets import data

# Handle large data sets without embedding them in the notebook
alt.data_transformers.enable("data_server")
# Display charts through Altair's 'mimetype' renderer
alt.renderers.enable('mimetype')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score


# Directories holding the processed Stinson (2019) tables
stinson_dir = os.path.join(os.pardir, "data", "processed", "stinson2019")
norm_path = os.path.join(stinson_dir, "norm_tables")
derived_path = os.path.join(stinson_dir, "derived_tables")

# Sap measurement data (weekly sap/sugar summary table)
sap_sugar_df = pd.read_pickle(os.path.join(derived_path, "sap_sugar_weekly_summary"))

# Information on sap measurement sites and on weather stations
location = pd.read_pickle(os.path.join(norm_path, "location"))
weather_stn = pd.read_pickle(os.path.join(norm_path, "weather_stn"))

# Freeze/thaw and growing degree data per weather station, associated with a
# sap measurement site through the closest-station table (joined on `stn_id`)
# and indexed by `datetime`
closest_weather_stn = pd.read_pickle(os.path.join(norm_path, "closest_weather_stn"))
gdd_frthw = (
    pd.read_pickle(os.path.join(derived_path, "gdd_frthw"))
    .merge(closest_weather_stn.reset_index(), how="left", on="stn_id")
    .set_index("datetime")
)


# **TODO**
# Get map showing outlines of Canadian Provinces and US States.
# **TODO**


def _east_west_labels(frame, text_field, dy, font_size, **mark_kwargs):
    """Return the (east, west) text layers labelling the rows of `frame`.

    Rows with lon > -80 get left-aligned labels shifted 5 px to the right;
    all other rows get right-aligned labels shifted 5 px to the left.  Extra
    keyword arguments are forwarded to `mark_text`.
    """
    layers = []
    for subset, align, dx in (
        (frame[frame.lon > -80], "left", 5),
        (frame[frame.lon <= -80], "right", -5),
    ):
        layers.append(
            alt.Chart(subset)
            .mark_text(align=align, baseline="middle", dx=dx, dy=dy, **mark_kwargs)
            .encode(
                longitude="lon:Q",
                latitude="lat:Q",
                text=text_field,
                size=alt.value(font_size),
            )
        )
    return tuple(layers)


# Sap measurement site points and text
site_points = (
    alt.Chart(location)
    .mark_circle()
    .encode(longitude="lon:Q", latitude="lat:Q", size=alt.value(30))
)
site_labels_east, site_labels_west = _east_west_labels(
    location, "long_name", dy=-3, font_size=10
)
site_plt = site_points + site_labels_east + site_labels_west

# Weather station points and text (labels sit below the point, in grey, so
# they do not collide with the site labels drawn above)
stn_points = (
    alt.Chart(weather_stn)
    .mark_circle(color="black")
    .encode(longitude="lon:Q", latitude="lat:Q", size=alt.value(10))
)
stn_labels_east, stn_labels_west = _east_west_labels(
    weather_stn, "stn_name", dy=7, font_size=8, color="grey"
)
stn_plt = stn_points + stn_labels_east + stn_labels_west

# Background: world country outlines
# states = alt.topo_feature(data.us_10m.url, feature='states')
states = alt.topo_feature(data.world_110m.url, "countries")
background = alt.Chart(states).mark_geoshape(fill="lightgray", stroke="white")

# Layer everything and zoom the projection onto the study area
(background + site_plt + stn_plt).project(
    "equirectangular", scale=1200, translate=[2200, 1100]
).properties(width=1000, height=600)


# Create a dataframe (`full_sap`) which contains information on both sap flow
# and the growing degree days and freeze/thaw cycles for all tap/year
# combinations.  Each tap/year table is right-merged onto the weather records
# of the site's station, so every weather date of that year yields a row even
# when no sap measurement exists for it.

# Columns that lead the final table (any other merged columns follow them)
full_sap_columns = [
    "tap_id",
    "date_from",
    "date_to",
    "weekly_sugarwt",
    "weekly_sap",
    "site",
    "cumGDD",
    "cum_frthw",
    "sap_binary",
]

# Per tap/year tables are collected here and concatenated once at the end;
# growing a DataFrame row-block by row-block copies it on every iteration and
# `DataFrame.append` no longer exists in pandas >= 2.0.
yearly_frames = []

for site in sap_sugar_df.site.unique():
    sap_sugar_site = sap_sugar_df[sap_sugar_df.site == site]
    # Weather records of this site do not depend on the tap: select them once
    site_weather = gdd_frthw[gdd_frthw.site == site]

    for tap in sap_sugar_site.tap_id.unique():
        sap_sugar_tap = sap_sugar_site[sap_sugar_site.tap_id == tap]

        for year in sap_sugar_tap.date_to.dt.year.unique():
            sap_sugar_year = sap_sugar_tap[sap_sugar_tap.date_to.dt.year == year].drop(
                columns=["site"]
            )

            # Merge weather from local station with sap measurements for a given tap and year
            sap_sugar_year = sap_sugar_year.merge(
                site_weather[site_weather.index.year == year],
                how="right",
                left_on=["date_to"],
                right_index=True,
            )

            # Add missing dates, tap_ids to df
            sap_sugar_year["date_from"] = sap_sugar_year["date_to"] - pd.DateOffset(6)
            sap_sugar_year["weather_datetime"] = sap_sugar_year["date_to"]
            sap_sugar_year["tap_id"] = sap_sugar_year["tap_id"].fillna(tap)

            # Weather dates without a measurement count as zero sap / sugar
            sap_sugar_year["weekly_sugarwt"] = sap_sugar_year["weekly_sugarwt"].fillna(0)
            sap_sugar_year["weekly_sap"] = sap_sugar_year["weekly_sap"].fillna(0)

            # For future implementation:
            # add total weekly freeze-thaw cycles column (week ending on 'date_to')
            #   sap_sugar_year.loc[:, "weekly_frthw"] = sap_sugar_year.loc[:, "cum_frthw"] - sap_sugar_year.loc[:, "cum_frthw"].shift(6)
            sap_sugar_year = sap_sugar_year.rename(columns={"frthw": "cum_frthw"})

            # Flag whether there is (1) or is not (0) sap flow in a given week
            # (week ending on 'date_to'); missing or zero volume means no flow
            weekly_sap = sap_sugar_year["weekly_sap"]
            sap_sugar_year["sap_binary"] = (
                weekly_sap.notna() & (weekly_sap != 0)
            ).astype("int")

            yearly_frames.append(sap_sugar_year)

if yearly_frames:
    full_sap = pd.concat(yearly_frames)
    # Put the expected columns first (created as NaN if never produced),
    # followed by every other merged column in order of appearance
    extra_columns = [c for c in full_sap.columns if c not in full_sap_columns]
    full_sap = full_sap.reindex(columns=full_sap_columns + extra_columns)
else:
    full_sap = pd.DataFrame(columns=full_sap_columns)

# Coerce sap_binary column to an integer dtype for subsequent analyses
full_sap["sap_binary"] = full_sap["sap_binary"].astype("int")

# Drop columns not used in current analysis and reset index
full_sap = full_sap.drop(columns=["weekly_sugarwt", "weekly_sap", "mean_airt"])
full_sap.reset_index(inplace=True, drop=True)


full_sap.head()


# Coefficients of linear model created by Houle 2015 and threshold for predicting 'True' for sap flow from logistic function.
# Coefficient order matches the columns used below: bias, F, F**2, G.
houle_coeff = np.array([-5.09, 0.733, -0.014, -0.07])
houle_thresh = 0.51

# Create dataframe of parameters required for Houle 2015 analysis:
# F = cumulative freeze/thaw cycles, G = cumulative growing degree days,
# Y = observed sap flow (1) or no sap flow (0)
LR_table = full_sap[
    ["site", "tap_id", "date_from", "date_to", "cum_frthw", "cumGDD", "sap_binary"]
].rename(columns={"cum_frthw": "F", "cumGDD": "G", "sap_binary": "Y"})

LR_table["F2"] = LR_table.F ** 2  # Add F**2 column
LR_table["bias"] = 1  # Add bias column


# Calculations to generate weekly sapflow predictions based on model by Houle et al., 2015
LR_table["P"] = LR_table[["bias", "F", "F2", "G"]] @ houle_coeff  # linear predictor
LR_table["S"] = 1 / (1 + np.exp(-LR_table["P"]))  # intermediate sigmoid output
LR_table["Y_hat"] = (LR_table["S"] > houle_thresh).astype(
    "int"
)  # Threshold determined by Houle 2015


LR_table["jd"] = LR_table["date_to"].dt.dayofyear
LR_table["year"] = LR_table["date_to"].dt.year

# Add 0/1 indicator columns for true positives, false positives, false
# negatives, and true negatives.  Computed with whole-column comparisons
# rather than a Python-level function call per row.
sap_observed = LR_table["Y"] == 1
no_sap_observed = LR_table["Y"] == 0
sap_predicted = LR_table["Y_hat"] == 1
no_sap_predicted = LR_table["Y_hat"] == 0

LR_table["tp"] = (sap_observed & sap_predicted).astype("int")
LR_table["fp"] = (no_sap_observed & sap_predicted).astype("int")
LR_table["fn"] = (sap_observed & no_sap_predicted).astype("int")
LR_table["tn"] = (no_sap_observed & no_sap_predicted).astype("int")


LR_table.head()


# Summarise the performance of the Houle et al., 2015 model per tap, year and
# site: `precision_1` scores the weeks with sap flow and `precision_0` the
# weeks without sap flow.
# NOTE: as computed here, precision_1 = tp / (tp + fn) and
# precision_0 = tn / (tn + fp), i.e. the denominators are the *observed*
# positives / negatives.  The column names are kept because later cells
# reference them.
summary_keys = ["tap_id", "year", "site"]
LR_summary = (
    LR_table.groupby(summary_keys)[["tn", "fp", "fn", "tp"]]
    .sum()
    .reset_index()
    .assign(
        precision_1=lambda counts: counts.tp / (counts.tp + counts.fn),
        precision_0=lambda counts: counts.tn / (counts.tn + counts.fp),
    )
)

# Attach the site's display name and build a "name, state/province" label
site_names = location.reset_index()[["site", "short_name", "state_province"]]
LR_summary = LR_summary.merge(site_names, on="site", how="left")
LR_summary["loc"] = LR_summary.short_name + ", " + LR_summary.state_province


LR_summary.head()


# Sample plot demonstrating the output of the sigmoid function over the course
# of a year, for one tap (QC1A at site QC, 2015).

# Legend text for each prediction outcome; the order fixes the legend order
# and pairs with the colour / shape ranges used in the chart below
outcome_labels = {
    "tp": "Sap observed; sap predicted by LR (tp)",
    "fn": "Sap observed; no sap predicted by LR (fn)",
    "tn": "No sap observed; no sap predicted by LR (tn)",
    "fp": "No sap observed; sap predicted by LR (fp)",
}
domain_pred = list(outcome_labels.values())

is_sample = (
    (LR_table.site == "QC")
    & (LR_table.date_to.dt.year == 2015)
    & (LR_table.tap_id == "QC1A")
)
sample_rows = LR_table[is_sample].rename(columns=outcome_labels).reset_index()

# Long format: one row per (original row, outcome); re-attach the model
# outputs, then keep only the outcome that actually applies to each row
sample_sigmoid = sample_rows.melt(id_vars="index", value_vars=domain_pred).merge(
    sample_rows[["index", "S", "P", "Y", "Y_hat", "jd"]],
    on="index",
)
sample_sigmoid = sample_sigmoid[sample_sigmoid.value == 1]

# Points: green = correct prediction, red = wrong prediction;
# cross = sap observed, circle = no sap observed
outcome_points = (
    alt.Chart(sample_sigmoid)
    .mark_point(size=100, stroke=None, opacity=0.8)
    .encode(
        alt.X("jd", title="Day of year", scale=alt.Scale(domain=[60, 150])),
        alt.Y("S", title="Sigmoid function output"),
        fill=alt.Fill(
            "variable",
            title="LR Result",
            scale=alt.Scale(domain=domain_pred, range=["green", "red", "green", "red"]),
        ),
        shape=alt.Shape(
            "variable",
            scale=alt.Scale(
                domain=domain_pred, range=["cross", "cross", "circle", "circle"]
            ),
        ),
    )
    .interactive()
)

# Dashed horizontal line at the decision threshold
threshold_rule = (
    alt.Chart(pd.DataFrame({"ht": [houle_thresh]}))
    .mark_rule(color="black", strokeDash=[10, 10])
    .encode(y="ht")
)

sigmoid_plt = outcome_points + threshold_rule

sigmoid_plt.properties(width=800, height=500).configure_legend(labelLimit=0)


# Generate plots showing prediction precision by site (red lines for comparison to precision reported by Houle et al., 2015)

houle_means = pd.DataFrame({"prec_1": [0.83], "prec_0": [0.95]})


def _site_precision_plot(value_field, value_title, houle_field):
    """Box plot of `LR_summary[value_field]` per site location, overlaid with a
    dashed red rule at the `houle_field` value of `houle_means`."""
    boxes = (
        alt.Chart(LR_summary)
        .mark_boxplot()
        .encode(
            y=alt.Y(value_field, title=value_title),
            x=alt.X("loc", title="Site Location"),
        )
    )
    houle_rule = (
        alt.Chart(houle_means)
        .mark_rule(color="red", strokeDash=[10, 10])
        .encode(y=houle_field)
    )
    return boxes + houle_rule


# Precision for weeks with sap production
prec_1_plt = _site_precision_plot(
    "precision_1", "Precision of prediction of weeks with sap flow", "prec_1"
)

# Precision for weeks without sap production
prec_0_plt = _site_precision_plot(
    "precision_0", "Precision of prediction of weeks without sap flow", "prec_0"
)


width = 500


prec_1_plt.properties(width=width)


prec_0_plt.properties(width=width)


# Fit one logistic regression per site on F, F**2 and G.
log_regs = {}  # site -> fitted LogisticRegression
site_spec_reg_prec = pd.DataFrame(columns=LR_table.site.unique().tolist())
test_list = []  # Row labels (LR_table index) of all held-out test rows

for site in LR_table.site.unique():
    site_rows = LR_table[LR_table.site == site]
    X = site_rows[["F", "F2", "G"]]
    y = np.ravel(site_rows[["Y"]])

    # 80/20 split, stratified by tap so each tap is represented on both sides
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=0.8,
        random_state=123,
        stratify=site_rows.tap_id,
    )

    site_model = LogisticRegression()
    site_model.fit(X_train, y_train)
    log_regs[site] = site_model

    # Precision of the site's model on its own held-out rows
    site_spec_reg_prec[site] = [precision_score(y_test, site_model.predict(X_test))]

    # Remember which rows were held out as test data
    test_list.extend(X_test.index.to_list())


# Predict sap flow for every row with the logistic regression fitted for the
# row's site.  `predict_proba` returns one column per class; the second column
# is the one thresholded into the prediction `Y_logreg`.
site_specific_prob = pd.Series(np.nan, index=LR_table.index)
for site in LR_table.site.unique():
    in_site = LR_table.site == site
    site_specific_prob[in_site] = log_regs[site].predict_proba(
        LR_table.loc[in_site, ["F", "F2", "G"]]
    )[:, 1]

# Use same regression threshold as Houle et al. (2015)
LR_table["Y_logreg"] = (site_specific_prob > houle_thresh).astype("int")


# Add 0/1 indicator columns for true positives, false positives, false
# negatives, and true negatives of the site-specific model.  Computed with
# whole-column comparisons rather than a Python-level function call per row.
sap_observed = LR_table["Y"] == 1
no_sap_observed = LR_table["Y"] == 0
sap_predicted_lr = LR_table["Y_logreg"] == 1
no_sap_predicted_lr = LR_table["Y_logreg"] == 0

LR_table["tp_lr"] = (sap_observed & sap_predicted_lr).astype("int")
LR_table["fp_lr"] = (no_sap_observed & sap_predicted_lr).astype("int")
LR_table["fn_lr"] = (sap_observed & no_sap_predicted_lr).astype("int")
LR_table["tn_lr"] = (no_sap_observed & no_sap_predicted_lr).astype("int")


# Calculate summary of performance of the site-specific logistic regression
# models for weeks with sap flow (`precision_1_lr`) and weeks without sap flow
# (`precision_0_lr`).  Only test data is included in these values.
lr_summary_keys = ["tap_id", "year", "site"]
lr_count_columns = ["tn_lr", "fp_lr", "fn_lr", "tp_lr"]

test_counts = (
    LR_table.loc[test_list, lr_summary_keys + lr_count_columns]
    .groupby(lr_summary_keys)
    .sum()
    .reset_index()
)

# Join on the grouping keys rather than relying on both tables having the same
# row order: a tap/year/site group with no test rows then gets NaN instead of
# shifting every following row's counts.  Dropping any previous copy of the
# count columns keeps this cell safe to re-run.
LR_summary = LR_summary.drop(columns=lr_count_columns, errors="ignore").merge(
    test_counts, on=lr_summary_keys, how="left"
)

LR_summary["precision_1_lr"] = LR_summary.tp_lr / (LR_summary.tp_lr + LR_summary.fn_lr)
LR_summary["precision_0_lr"] = LR_summary.tn_lr / (LR_summary.tn_lr + LR_summary.fp_lr)


# Generate plots comparing, per site, the prediction precision of the Houle et
# al. (2015) model with that of the site-specific logistic regressions.

# Long format: one row per (summary row, precision column), with the summary
# columns (site, tap, ...) re-attached for faceting
summary_rows = LR_summary.reset_index()
LR_prec = summary_rows.melt(
    id_vars="index",
    value_vars=["precision_1", "precision_1_lr", "precision_0", "precision_0_lr"],
    value_name="prec_model",
).merge(summary_rows, on="index")

# Columns without the `_lr` suffix come from the Houle model
LR_prec["model"] = np.where(
    LR_prec.variable.isin(["precision_1", "precision_0"]),
    "Houle",
    "Site Specific",
)


def _model_comparison_plot(variables, y_title):
    """Box plots of `prec_model` per model, one facet column per site,
    restricted to the rows of `LR_prec` whose `variable` is in `variables`."""
    return (
        alt.Chart(LR_prec[LR_prec["variable"].isin(variables)])
        .mark_boxplot()
        .encode(
            y=alt.Y("prec_model", title=y_title),
            x=alt.X("model", title="Model"),
            color=alt.Color("model", title="Model"),
            column=alt.Column("site", title="Site"),
        )
    )


# The earlier `.mark_rule(color="red").encode(y="houle")` calls on this chart
# were removed: `LR_prec` has no "houle" column and both calls were overwritten
# by the box plot mark and its `y` encoding, so they never affected the chart.
prec_1_plt_lr = _model_comparison_plot(
    ["precision_1", "precision_1_lr"],
    "Precision of prediction of weeks with sap flow",
)

prec_0_plt_lr = _model_comparison_plot(
    ["precision_0", "precision_0_lr"],
    "Precision of prediction of weeks with no sap flow",
)


width = 100


prec_1_plt_lr.properties(width=width)


prec_0_plt_lr.properties(width=width)


# Build a table of mean precision per site (plus an overall row) for the Houle
# model and the site-specific models, alongside the values reported by Houle.
models = ["Houle on Stinson Data", "Site Specific on Stinson Data", "Houle Original"]
precision_type = ["With Sap", "Without"]

column_index = pd.MultiIndex.from_product(
    [models, precision_type], names=["Model:", "Precision for weeks:"]
)
overall_prec = pd.DataFrame(
    columns=column_index,
    index=LR_table.site.unique().tolist(),
)
overall_prec.index.name = "Site"

# Table column -> LR_summary column holding the per tap/year precision
precision_sources = {
    ("Houle on Stinson Data", "With Sap"): "precision_1",
    ("Houle on Stinson Data", "Without"): "precision_0",
    ("Site Specific on Stinson Data", "With Sap"): "precision_1_lr",
    ("Site Specific on Stinson Data", "Without"): "precision_0_lr",
}
precision_columns = list(precision_sources.values())

# Average only the precision columns: LR_summary also holds text columns
# (tap_id, short_name, ...) that cannot be averaged and make a frame-wide
# `.mean()` raise in pandas >= 2.0.  The group mean is computed once.
site_means = LR_summary.groupby("site")[precision_columns].mean()
for table_column, source_column in precision_sources.items():
    # Plain column assignment (aligned on site) so the column becomes float
    # and is affected by the final rounding
    overall_prec[table_column] = site_means[source_column]

# Houle's original values are only available overall, not per site
overall_prec.loc[
    :, [("Houle Original", "With Sap"), ("Houle Original", "Without")]
] = ""

overall_means = LR_summary[precision_columns].mean()
overall_prec.loc["Overall", :] = [
    overall_means.precision_1,
    overall_means.precision_0,
    overall_means.precision_1_lr,
    overall_means.precision_0_lr,
    0.83,  # reported by Houle et al. (2015), weeks with sap flow
    0.95,  # reported by Houle et al. (2015), weeks without sap flow
]


overall_prec.round(decimals=2)

	tap_id	date_from	date_to	site	stn_id	weather_datetime
0	DOF1A	2013-12-26	2014-01-01	DOF	726116-94765	2014-01-01
1	DOF1A	2013-12-27	2014-01-02	DOF	726116-94765	2014-01-02
2	DOF1A	2013-12-28	2014-01-03	DOF	726116-94765	2014-01-03
3	DOF1A	2013-12-29	2014-01-04	DOF	726116-94765	2014-01-04
4	DOF1A	2013-12-30	2014-01-05	DOF	726116-94765	2014-01-05

	tap_id	year	site	tn	fp	fn	tp	precision_1	precision_0	short_name	state_province	loc
0	DOF10A	2014	DOF	265	62	21	17	0.447368	0.810398	Dartmouth	NH	Dartmouth, NH
1	DOF10A	2015	DOF	322	3	4	36	0.900000	0.990769	Dartmouth	NH	Dartmouth, NH
2	DOF10A	2016	DOF	290	28	32	16	0.333333	0.911950	Dartmouth	NH	Dartmouth, NH
3	DOF10A	2017	DOF	290	40	14	21	0.600000	0.878788	Dartmouth	NH	Dartmouth, NH
4	DOF10B	2014	DOF	265	68	21	11	0.343750	0.795796	Dartmouth	NH	Dartmouth, NH

Model:	Houle on Stinson Data		Site Specific on Stinson Data		Houle Original
Precision for weeks:	With Sap	Without	With Sap	Without	With Sap	Without
Site
DOF	0.54	0.89	0.75	0.98
DR	0.26	0.95	0.32	0.98
HF	0.71	0.91	0.74	0.97
INDU	0.76	0.93	0.77	0.95
QC	0.86	0.97	0.79	0.99
SMM	0.74	0.88	0.80	0.99
Overall	0.68	0.92	0.74	0.98	0.83	0.95

Sap flow prediction Analysis¶

Introduction¶

Data Preparation¶

Locations¶

Analysis Table Creation¶

Analysis¶

Logistic Regression Model¶

Logistic Regression Predictions¶

Precision of Predictions¶

Site Specific Logistic Regression¶

Results¶

Overall Precision¶

Model Comparison¶