In [11]:
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns

import weather
import plants

import importlib
In [12]:
importlib.reload(weather)
Out[12]:
<module 'weather' from '/Users/oneoPk/Desktop/Classes/330/final-project/weather.py'>
In [13]:
importlib.reload(plants)
Out[13]:
<module 'plants' from '/Users/oneoPk/Desktop/Classes/330/final-project/plants.py'>
In [14]:
# show at most 15 rows when printing a full pandas DataFrame
pd.options.display.max_rows = 15
In [15]:
import json  # need it for json.dumps
import altair as alt
from altair.vega import v3
from IPython.display import HTML

# Create the correct URLs for require.js to find the JavaScript libraries
vega_url = 'https://cdn.jsdelivr.net/npm/vega@' + v3.SCHEMA_VERSION
vega_lib_url = 'https://cdn.jsdelivr.net/npm/vega-lib'
vega_lite_url = 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION
vega_embed_url = 'https://cdn.jsdelivr.net/npm/vega-embed@3'
noext = "?noext"

paths = {
    'vega': vega_url + noext,
    'vega-lib': vega_lib_url + noext,
    'vega-lite': vega_lite_url + noext,
    'vega-embed': vega_embed_url + noext
}

workaround = """
requirejs.config({{
    baseUrl: 'https://cdn.jsdelivr.net/npm/',
    paths: {paths}
}});
"""

HTML("".join((
    "<script>",
    workaround.format(paths=json.dumps(paths)),
    "</script>",
    "This code block sets up embedded rendering in HTML."
)))
Out[15]:
This code block sets up embedded rendering in HTML.
In [16]:
# Define the function for rendering
def add_autoincrement(render_func):
    # Keep track of unique <div/> IDs
    cache = {}
    def wrapped(chart, id="vega-chart", autoincrement=True):
        """Render an altair chart directly via javascript.
        
        This is a workaround for functioning export to HTML.
        (It probably messes up other ways to export.) It will
        cache and autoincrement the ID suffixed with a
        number (e.g. vega-chart-1) so you don't have to deal
        with that.
        """
        if autoincrement:
            if id in cache:
                counter = 1 + cache[id]
                cache[id] = counter
            else:
                cache[id] = 0
            actual_id = id if cache[id] == 0 else id + '-' + str(cache[id])
        else:
            if id not in cache:
                cache[id] = 0
            actual_id = id
        return render_func(chart, id=actual_id)
    # Cache will stay defined and keep track of the unique div Ids
    return wrapped


@add_autoincrement
def render(chart, id="vega-chart"):
    # The JavaScript below renders the chart directly using vegaEmbed
    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vegaEmbed) {{
        const spec = {chart};     
        vegaEmbed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
    }});
    </script>
    """
    return HTML(
        chart_str.format(
            id=id,
            chart=json.dumps(chart) if isinstance(chart, dict) else chart.to_json(indent=None)
        )
    )

Effects of Climate on Botrytis Infection in Peonies

1. Weather Data

First, I am getting the data from the weather API for the relevant months
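
The weather module used below is a small project-specific helper whose source is not shown in this notebook. A rough sketch of the interface it presumably exposes is given here for orientation; the function names and call signatures are taken from how they are used below, while everything inside the bodies (including how the API is queried) is an assumption, not the actual implementation.

# Hypothetical sketch of the weather helper module (illustrative only)
import csv

def get_weather_data(city, years, months):
    """Return a list of per-day weather records for the given city.

    `months` is a list of ['MM-DD', 'MM-DD'] start/end pairs and `years` is a
    list of ints, matching how the function is called in this notebook.
    """
    days = []
    for year in years:
        for start, end in months:
            # Presumably queries a weather API for the range
            # f"{year}-{start}" to f"{year}-{end}" and appends one dict per day
            # with keys such as 'date', 'maxtempC', 'mintempC', 'precipMM',
            # 'humidity', 'cloudcover', and 'city'.
            ...
    return days

def dump_to_csv(path, days):
    """Write the per-day records to a CSV file with a header row."""
    fieldnames = ['date', 'maxtempC', 'mintempC', 'precipMM',
                  'humidity', 'cloudcover', 'city']
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(days)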

In [17]:
# shorthand notation for the relevant cities
AA = "Ann Arbor, MI"
MSK = "Minsk, Belarus"

# these are the date ranges (MM-DD) to fetch for each year
april     = ['04-15','04-30']
may       = ['05-01','05-31']
june      = ['06-01','06-30']
july      = ['07-01','07-31']
august    = ['08-01','08-31']
september = ['09-01','09-30']

def get_weather_data(city,months,years,file):
    # requesting the data from the API and dumping it to a CSV file
    data = weather.get_weather_data(city,years,months)
    weather.dump_to_csv(file, days = data)

    # reading weather into pandas dataframe
    df = pd.read_csv(file)

    # converting the date column to datetime
    df.date = df.date.astype('datetime64[ns]')
    
    # adding a column with the daily average temperature
    df['avgtempC'] = (df['maxtempC'] + df['mintempC'])/2

    
    # returning the result
    return df
In [18]:
#### GETTING WEATHER FOR ANN ARBOR ####
city   = AA
months = [may,june,july,august]
years  = [2012,2013,2014,2015,2016,2017,2018]
file   = 'aa_weather.csv'

# getting weather data for each day of given month
aa_weather_df = get_weather_data(city,months,years,file)

# need to calculate averages for each year
aa_weather_mean_by_year_df = aa_weather_df.groupby(aa_weather_df.date.dt.year).mean()
In [19]:
#### GETTING WEATHER FOR MINSK ####
city   = MSK
months = [may,june,july,august]
years  = [2012,2013,2014,2015,2016,2017,2018]
file   = 'msk_weather.csv'

# getting weather data for each day of given month
msk_weather_df = get_weather_data(city,months,years,file)

# need to calculate averages for each year
msk_weather_mean_by_year_df = msk_weather_df.groupby(msk_weather_df.date.dt.year).mean()
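
As an optional sanity check (not part of the original notebook), each year of May through August should contribute 31 + 30 + 31 + 31 = 123 daily rows to each weather frame; a quick way to confirm:

# optional sanity check: expect roughly 123 daily records per year (May-August)
print(aa_weather_df.groupby(aa_weather_df.date.dt.year).size())
print(msk_weather_df.groupby(msk_weather_df.date.dt.year).size())
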
In [20]:
brush = alt.selection(type='interval', encodings=['x'])

msk = alt.Chart(msk_weather_df).mark_line().encode(
    x=alt.X('date:T', scale={'domain': brush.ref()}),
    y=alt.Y('humidity:Q',scale = {'domain':(40, 100)})
).properties(
    width=600,
    height=150
).encode(color='city:N')


aa = alt.Chart(aa_weather_df).mark_line().encode(
    x=alt.X('date:T', scale={'domain': brush.ref()}),
    y=alt.Y('humidity:Q',scale = {'domain':(40, 100)})
).properties(
    width=600,
    height=150
).encode(color='city:N')

lower = (msk.properties(
    height=60,
    width=600
) + aa.add_selection(brush)).properties(title="Interactive mini-map. Use the cursor to select the areas you'd like to zoom into")

render(alt.vconcat(msk,aa,msk+aa,lower).properties(title='Humidity in Minsk, Belarus and Ann Arbor, MI, USA').configure_mark(opacity=0.8))
Out[20]:
In [21]:
msk = msk.encode(y=alt.Y('avgtempC:Q'))
aa = aa.encode(y=alt.Y('avgtempC:Q')) 

plot = alt.vconcat(msk,aa,msk+aa,lower).properties(title='Temperature in Minsk and Ann Arbor')
render(plot)
Out[21]:
In [22]:
msk = msk.encode(y=alt.Y('cloudcover:Q',scale = {'domain':(-10, 110)}))
aa = aa.encode(y=alt.Y('cloudcover:Q',scale = {'domain':(-10, 110)}))

render(alt.vconcat(msk,aa,msk+aa,lower).properties(title='Cloudcover in Minsk, Belarus and Ann Arbor, MI, USA'))
Out[22]:
In [23]:
msk = msk.encode(y=alt.Y('precipMM:Q',scale = {'domain':(-10, 50)})).mark_line()
aa = aa.encode(y=alt.Y('precipMM:Q',scale = {'domain':(-10, 50)})).mark_line()

render(alt.vconcat(msk,aa,msk+aa,lower).properties(title='Precipitation in Minsk, Belarus and Ann Arbor, MI, USA'))
Out[23]:

2. Plants Data

Second, I am going to collect the plants' morphological properties into a pandas DataFrame to combine with the weather data
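
The plants module is another project-specific helper whose source is not included here. Judging from the call below, plants.read_excel presumably reads the listed sheets of the workbook, applies the given column names, and concatenates everything into one DataFrame; the sketch below is an assumption of that behaviour, not the actual code.

# Hypothetical sketch of the plants helper (assumed behaviour)
import pandas as pd

def read_excel(file_name, sheet_nums, na_values, column_names):
    """Read the given sheets of an Excel workbook into a single DataFrame."""
    frames = []
    for sheet in sheet_nums:
        df = pd.read_excel(file_name, sheet_name=sheet, na_values=na_values)
        # assumes every sheet has exactly these columns, in this order
        df.columns = column_names
        frames.append(df)
    return pd.concat(frames)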

In [24]:
file_name = 'data/plants/msk_aa_15-18.xlsx'
sheet_nums = [4,5,6,7]
column_names = ["No",
                "genetic_Acc_No",
                "num",
                "Cultivar",
                "Serial_No",
                "ID",
                "Regidity",
                "Thickness",
                "Bush",
                "Early_Bot",
                "Bot",
                "Mildew",
                "date",
                "City"]

plants_aa_df = plants.read_excel(file_name, sheet_nums = sheet_nums, na_values=['NA','Na'], column_names=column_names)
In [25]:
plants_aa_df.tail()
Out[25]:
No genetic_Acc_No num Cultivar Serial_No ID Regidity Thickness Bush Early_Bot Bot Mildew date City
28 53.0 53.1 53.2 Polar Star 377.0 154AB 4.25 1.041667 4.5 NaN 0.020 0.1 2018-07-26 Ann Arbor
29 54.0 54.1 54.2 Edulus Superba 97.0 42AB 4.15 0.880000 4.5 NaN 0.200 NaN 2018-07-26 Ann Arbor
30 106.0 106.1 106.2 Kansas NaN 24CD 4.50 1.155000 4.5 NaN 0.150 1.5 2018-07-26 Ann Arbor
31 112.0 112.1 112.2 Richard Carvel NaN 44EF NaN NaN NaN NaN NaN NaN 2018-07-26 Ann Arbor
32 113.0 113.1 113.2 Richardson Perfection NaN 124CD 3.75 1.031667 3.5 NaN 0.075 NaN 2018-07-26 Ann Arbor

3. Combining both

Finally, it is time to combine the weather data with the plants data

First, let's take a look at the weather data averaged by year

In [26]:
msk_weather_mean_by_year_df
Out[26]:
maxtempC mintempC precipMM humidity cloudcover avgtempC
date
2012 22.138211 11.617886 3.608943 72.853659 38.593496 16.878049
2013 23.544715 12.601626 2.954472 70.081301 32.739837 18.073171
2014 24.138211 12.024390 3.405691 67.837398 38.699187 18.081301
2015 23.146341 11.211382 1.502439 68.991870 32.617886 17.178862
2016 22.463415 12.373984 2.100813 74.723577 37.016260 17.418699
2017 20.756098 14.788618 3.184553 71.357724 36.707317 17.772358
2018 21.943089 16.666667 3.470732 75.642276 43.398374 19.304878
In [27]:
aa_weather_mean_by_year_df
Out[27]:
maxtempC mintempC precipMM humidity cloudcover avgtempC
date
2012 29.569106 19.634146 2.061789 64.422764 24.926829 24.601626
2013 25.073171 16.951220 2.753659 77.138211 37.113821 21.012195
2014 24.967480 16.471545 3.622764 82.170732 34.040650 20.719512
2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488
2016 26.455285 17.772358 3.247967 70.861789 44.146341 22.113821
2017 24.373984 16.219512 3.739837 72.032520 31.479675 20.296748
2018 25.186992 16.829268 4.651220 70.934959 42.341463 21.008130

The year is the only connection between the two datasets (it is the index of the weather data)
I'm going to create a new year column in the plants data to match the weather data on

In [28]:
plants_aa_df['year'] = plants_aa_df.date.apply(lambda x: str(x)[0:4])
In [29]:
plants_aa_df.sample(4)
Out[29]:
No genetic_Acc_No num Cultivar Serial_No ID Regidity Thickness Bush Early_Bot Bot Mildew date City year
14 39.0 39.1 39.2 La Rosiere 201.0 82AB 3.5 0.83 3.5 NaN 0.055 NaN 2018-07-26 Ann Arbor 2018
30 106.0 106.1 106.2 Kansas NaN 24CD NaN NaN NaN NaN NaN NaN 2017-07-10 Ann Arbor 2017
17 42.0 42.1 42.2 Marie Crousse 81.0 34CD 4.0 0.82 4.5 NaN 0.200 NaN 2018-07-26 Ann Arbor 2018
17 42.0 42.1 42.2 Marie Crousse 81.0 34CD 3.0 NaN 3.0 NaN 0.150 NaN 2017-07-10 Ann Arbor 2017

Both merge keys must have the same type, so I cast them both to strings

In [30]:
plants_aa_df.year = plants_aa_df.year.astype('str')
aa_weather_mean_by_year_df.index = aa_weather_mean_by_year_df.index.astype('str')

Now we can merge the two datasets

In [31]:
plants_weather_aa_df = plants_aa_df.merge(aa_weather_mean_by_year_df, left_on='year', right_on='date')
In [32]:
plants_weather_aa_df.head()
Out[32]:
No genetic_Acc_No num Cultivar Serial_No ID Regidity Thickness Bush Early_Bot ... Mildew date City year maxtempC mintempC precipMM humidity cloudcover avgtempC
0 25.0 25.1 25.2 Albert Crousse 391.0 161CD 3.125 7.285 3.00 NaN ... 0.0 2015-07-11 Ann Arbor 2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488
1 26.0 26.1 26.2 Arcturus 115.0 45AB 2.750 6.100 3.00 NaN ... 0.1 2015-07-11 Ann Arbor 2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488
2 27.0 27.1 27.2 Arlequin 73.0 33AB 3.000 7.250 4.00 NaN ... 0.0 2015-07-11 Ann Arbor 2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488
3 28.0 28.1 28.2 Augustin D'Hour = General MacMahon 383.0 155AB 3.500 8.450 4.25 NaN ... 0.0 2015-07-11 Ann Arbor 2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488
4 29.0 29.1 29.2 Boule de Neige 211.0 84CD 4.000 8.935 4.00 NaN ... 0.0 2015-07-11 Ann Arbor 2015 26.195122 17.365854 3.277236 75.560976 39.560976 21.780488

5 rows × 21 columns
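
Note that merge defaults to an inner join, so any plant record from a year with no matching weather data is silently dropped. A quick check (not in the original notebook) to confirm how many rows survived:

# optional: compare row counts before and after the merge; a difference means
# some plant records had no weather data for their year
print(len(plants_aa_df), len(plants_weather_aa_df))
print(sorted(plants_aa_df.year.unique()))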

4. Combined Data Processing

I'd like to find the cultivars that are most resistant to Botrytis (Bot)
Specifically, I am looking for plants whose Bot value has never gone above a chosen threshold

In [33]:
# setting the cutoff threshold
threshold = 0.4

# grouping the plants by cultivar and finding the maximum infection level within each cultivar
max_bot_df = plants_weather_aa_df.groupby('Cultivar').Bot.max()

# finding most resistant based on the threshold
most_resistant = max_bot_df[max_bot_df < threshold]
print(most_resistant)
most_resistant_list = most_resistant.index

# filtering the original dataframe with all the plants to select only the most resistant
most_resistant_df = plants_weather_aa_df[ plants_weather_aa_df.Cultivar.isin(most_resistant_list) ]
Cultivar
Couronne d'Or            0.350
Do Tell                  0.100
Fortune Teller           0.150
Kansas                   0.150
La Perle                 0.200
Marie Lemoine            0.300
Polar Star               0.200
Richardson Perfection    0.075
Name: Bot, dtype: float64

Let's take a look at how many records each of the most resistant cultivars has

In [34]:
most_resistant_df['Cultivar'].value_counts()
Out[34]:
Fortune Teller           4
Do Tell                  4
La Perle                 4
Marie Lemoine            4
Polar Star               4
Couronne d'Or            4
Richardson Perfection    3
Kansas                   3
Name: Cultivar, dtype: int64

Removing Richardson Perfection and Kansas, since they have fewer records than the other cultivars and might not be good data

In [35]:
most_resistant_df = most_resistant_df[most_resistant_df.Cultivar != 'Richardson Perfection']
most_resistant_df = most_resistant_df[most_resistant_df.Cultivar != 'Kansas']

most_resistant_df['Cultivar'].value_counts()
Out[35]:
Fortune Teller    4
Do Tell           4
Marie Lemoine     4
La Perle          4
Couronne d'Or     4
Polar Star        4
Name: Cultivar, dtype: int64


Now, looking for the cultivars that are least resistant (i.e. most susceptible) to Botrytis (Bot)
Specifically, I am looking for plants whose maximum Bot value has exceeded a higher threshold (1.5)

In [36]:
# finding least resistant based on the threshold
least_resistant = max_bot_df[max_bot_df > 1.5]
print(least_resistant)
least_resistant_list = least_resistant.index

# filtering the original dataframe with all the plants to select only the least resistant
least_resistant_df = plants_weather_aa_df[ plants_weather_aa_df.Cultivar.isin(least_resistant_list) ]
Cultivar
Arcturus                2.00
Arlequin                2.00
Boule de Neige          2.00
Edulus Superba          2.00
La Rosiere              2.60
Madame de Verneville    2.00
Marie d'Hour            1.75
Monsieur Dupont         2.00
Monsieur Jules Elie     2.75
Octavie Demay           3.00
Name: Bot, dtype: float64

5. Visual Exploration

Defining a function that can display interactive scatterplots via Altair

In [37]:
def bot_interractive_scatterplots(source, title):
    brush = alt.selection(type='interval', resolve='global')

    base = alt.Chart(source).mark_circle(size = 125).encode(
        alt.X('Bot:Q'),
    #     color = 'Cultivar:N',
        color=alt.condition(brush, 'Cultivar', alt.ColorValue('gray'))
    #     shape = 'Cultivar:O',
    #     size = 'Bot:Q'
    ).add_selection(
        brush
    ).properties(
        width=250,
        height=250
    )

    # defining y-axis scales
    s_temp   = alt.Scale(domain=(20, 23))
    s_humid  = alt.Scale(domain=(69, 77))
    s_percip = alt.Scale(domain=(3, 5))
    s_cloud  = alt.Scale(domain=(30, 46))

    # plotting graphs next to each other
    return (base.encode(alt.Y('avgtempC',scale = s_temp))|\
    base.encode(alt.Y('humidity',scale = s_humid))|\
    base.encode(alt.Y('precipMM',scale = s_percip))|\
    base.encode(alt.Y('cloudcover',scale = s_cloud))).properties(title=title)

Now using the function to plot the most resistant cultivars

In [38]:
render(bot_interractive_scatterplots(most_resistant_df,'Botrytis and Climate relationship among Resistant Species'))
Out[38]:

Now the same, but for the least resistant cultivars

In [39]:
render(bot_interractive_scatterplots(least_resistant_df,'Botrytis and Climate relationship among Least Resistant Species'))
Out[39]:
In [40]:
source = most_resistant_df


brush = alt.selection(type='interval', resolve='global')

base = alt.Chart(source).mark_circle().encode(
    alt.Y('humidity:Q',scale=alt.Scale(domain=(69, 77),clamp=True)),
    size='Bot:Q',
    color=alt.condition(brush, 'Cultivar', alt.ColorValue('gray'))
).add_selection(
    brush
).properties(
    width=250,
    height=250
)

render(base.encode(x='Bot') | base.encode(x='Bush'))
Out[40]:
In [41]:
source = least_resistant_df


base = alt.Chart(source).mark_circle().encode(
    alt.Y('avgtempC:Q',scale=alt.Scale(domain=(19, 23),clamp=True)),
    size='Bot:Q',
    color=alt.condition(brush, 'Cultivar:N', alt.ColorValue('gray'))
).add_selection(
    brush
).properties(
    width=250,
    height=250
)

render(base.encode(x='Bot') | base.encode(x='Bush'))
Out[41]:

Plotting pairwise relationships for the most resistant cultivars to see if there might be a correlation between climate and infection level

In [42]:
g = sns.PairGrid(most_resistant_df, vars=['Bot','avgtempC', 'precipMM', 'cloudcover','humidity'])
g.map(sns.regplot)
Out[42]:
<seaborn.axisgrid.PairGrid at 0x11b98ab00>

Now the same process, but for the least resistant cultivars

In [43]:
g = sns.PairGrid(least_resistant_df, vars=['Bot','avgtempC', 'precipMM', 'cloudcover','humidity'])
g.map(sns.regplot)
Out[43]:
<seaborn.axisgrid.PairGrid at 0x1a241fc2e8>
In [44]:
df_temp = most_resistant_df

mr1 = sns.lmplot(x='avgtempC', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='humidity', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='precipMM', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='cloudcover', y='Bot', data=df_temp)
In [45]:
df_temp = least_resistant_df
mr1 = sns.lmplot(x='avgtempC', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='humidity', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='precipMM', y='Bot', data=df_temp)
mr2 = sns.lmplot(x='cloudcover', y='Bot', data=df_temp)

This is the correlation for the most resistant cultivars

In [46]:
most_resistant_df[['avgtempC','humidity','precipMM','cloudcover','Bot']].corr(method = 'pearson')
Out[46]:
avgtempC humidity precipMM cloudcover Bot
avgtempC 1.000000 0.197314 -0.556154 0.807600 0.520022
humidity 0.197314 1.000000 -0.471050 -0.221062 0.037895
precipMM -0.556154 -0.471050 1.000000 0.024654 -0.472540
cloudcover 0.807600 -0.221062 0.024654 1.000000 0.312931
Bot 0.520022 0.037895 -0.472540 0.312931 1.000000

This is the correlation for the least resistant cultivars

In [47]:
least_resistant_df[['avgtempC','humidity','precipMM','cloudcover','Bot']].corr(method = 'pearson')
Out[47]:
avgtempC humidity precipMM cloudcover Bot
avgtempC 1.000000 0.197314 -0.556154 0.807600 0.431747
humidity 0.197314 1.000000 -0.471050 -0.221062 0.903119
precipMM -0.556154 -0.471050 1.000000 0.024654 -0.529290
cloudcover 0.807600 -0.221062 0.024654 1.000000 0.035850
Bot 0.431747 0.903119 -0.529290 0.035850 1.000000
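
The correlation matrices report Pearson r but not significance, and the samples here are fairly small (a few records per cultivar). As an optional follow-up, assuming scipy is installed (this check is not part of the original analysis), p-values for the Bot correlations could be computed like this:

# Pairwise Pearson r and p-value between Bot and each climate variable.
# Drops rows where either column is NaN before computing the statistic.
from scipy.stats import pearsonr

def bot_correlations(df, climate_cols=('avgtempC', 'humidity', 'precipMM', 'cloudcover')):
    rows = []
    for col in climate_cols:
        pair = df[['Bot', col]].dropna()
        r, p = pearsonr(pair['Bot'], pair[col])
        rows.append({'variable': col, 'r': r, 'p': p, 'n': len(pair)})
    return pd.DataFrame(rows)

bot_correlations(most_resistant_df)
bot_correlations(least_resistant_df)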