# Standard library
import importlib
import json  # needed for json.dumps when embedding vega specs

# Third-party scientific / plotting stack
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
from altair.vega import v3
from IPython.display import HTML

# Project-local helper modules
import weather
import plants  # BUG FIX: was a bare `plants` expression, which raises NameError

# Pick up any in-progress edits to the helper modules
importlib.reload(weather)
importlib.reload(plants)

# Cap how many rows pandas prints when displaying a whole dataframe
pd.options.display.max_rows = 15
# Build the URLs require.js needs to locate the Vega Javascript libraries
vega_url = 'https://cdn.jsdelivr.net/npm/vega@' + v3.SCHEMA_VERSION
vega_lib_url = 'https://cdn.jsdelivr.net/npm/vega-lib'
vega_lite_url = 'https://cdn.jsdelivr.net/npm/vega-lite@' + alt.SCHEMA_VERSION
vega_embed_url = 'https://cdn.jsdelivr.net/npm/vega-embed@3'

# '?noext' stops require.js from appending a second .js extension
noext = "?noext"
paths = {name: url + noext for name, url in (
    ('vega', vega_url),
    ('vega-lib', vega_lib_url),
    ('vega-lite', vega_lite_url),
    ('vega-embed', vega_embed_url),
)}

# Javascript snippet that points require.js at the CDN paths above
workaround = """
requirejs.config({{
    baseUrl: 'https://cdn.jsdelivr.net/npm/',
    paths: {paths}
}});
"""

# Inject the configuration into the notebook's HTML output
script_block = "".join((
    "<script>",
    workaround.format(paths=json.dumps(paths)),
    "</script>",
    "This code block sets up embedded rendering in HTML.",
))
HTML(script_block)
# Define the function for rendering
def add_autoincrement(render_func):
    """Decorator that gives every rendered chart a unique <div/> id.

    Repeated renders with the same base id get a numeric suffix
    (vega-chart, vega-chart-1, vega-chart-2, ...) so the javascript
    embed targets never collide in the exported HTML.
    """
    # base id -> number of times it has been handed out (0 = bare id)
    seen = {}

    def wrapped(chart, id="vega-chart", autoincrement=True):
        """Render `chart`, rewriting `id` to a unique div id first."""
        if autoincrement:
            count = seen.get(id, -1) + 1
            seen[id] = count
            actual_id = id if count == 0 else "{}-{}".format(id, count)
        else:
            seen.setdefault(id, 0)
            actual_id = id
        return render_func(chart, id=actual_id)

    # `seen` lives on in the closure and keeps tracking ids across calls
    return wrapped
@add_autoincrement
def render(chart, id="vega-chart"):
    """Return an HTML snippet that embeds `chart` via vegaEmbed.

    Accepts either an Altair chart object or an already-built vega
    spec as a plain dict.
    """
    # Javascript template that calls vegaEmbed on a fresh <div/>
    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vegaEmbed) {{
        const spec = {chart};
        vegaEmbed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
    }});
    </script>
    """
    if isinstance(chart, dict):
        spec_json = json.dumps(chart)
    else:
        spec_json = chart.to_json(indent=None)
    return HTML(chart_str.format(id=id, chart=spec_json))
First, I am getting the data from the weather API for the appropriate months.
# Shorthand notation for the relevant cities
AA = "Ann Arbor, MI"
MSK = "Minsk, Belarus"
# These are the ['MM-DD', 'MM-DD'] start/end date blocks to fetch
# from the weather API (April starts mid-month)
april = ['04-15','04-30']
may = ['05-01','05-31']
june = ['06-01','06-30']
july = ['07-01','07-31']
august = ['08-01','08-31']
september = ['09-01','09-30']
def get_weather_data(city, months, years, file):
    """Fetch daily weather for `city` over the given months and years.

    Requests the data from the weather API, caches it to `file` as CSV,
    then loads it back into a pandas dataframe with a proper datetime
    `date` column and a derived `avgtempC` column.

    Parameters:
        city: city name string as accepted by the weather module
        months: list of ['MM-DD', 'MM-DD'] date blocks
        years: list of years (ints) to fetch
        file: path of the CSV cache file to write and read back
    Returns:
        pandas.DataFrame with one row per day.
    """
    # Request the data from the API and dump it to CSV
    data = weather.get_weather_data(city, years, months)
    weather.dump_to_csv(file, days=data)
    # Read the cached weather back into a pandas dataframe
    df = pd.read_csv(file)
    # Convert the date column into a proper datetime column.
    # pd.to_datetime is the supported idiom; astype('datetime64[ns]')
    # is deprecated/fragile in recent pandas.
    df['date'] = pd.to_datetime(df['date'])
    # Derive the daily average temperature from the min/max columns
    df['avgtempC'] = (df['maxtempC'] + df['mintempC']) / 2
    return df
#### GETTING WEATHER FOR ANN ARBOR ####
city = AA
months = [may, june, july, august]
years = list(range(2012, 2019))
file = 'aa_weather.csv'
# Daily weather for every requested month of every requested year
aa_weather_df = get_weather_data(city, months, years, file)
# Per-year averages of the daily climate columns
aa_weather_mean_by_year_df = aa_weather_df.groupby(aa_weather_df.date.dt.year).mean()

#### GETTING WEATHER FOR MINSK ####
city = MSK
months = [may, june, july, august]
years = list(range(2012, 2019))
file = 'msk_weather.csv'
# Daily weather for every requested month of every requested year
msk_weather_df = get_weather_data(city, months, years, file)
# Per-year averages of the daily climate columns
msk_weather_mean_by_year_df = msk_weather_df.groupby(msk_weather_df.date.dt.year).mean()
# Interval brush on the x (time) axis, shared by all the charts below:
# selecting on the mini-map zooms the linked charts.
brush = alt.selection(type='interval', encodings=['x'])

msk = alt.Chart(msk_weather_df).mark_line().encode(
    x=alt.X('date:T', scale={'domain': brush.ref()}),
    y=alt.Y('humidity:Q', scale={'domain': (40, 100)})
).properties(
    width=600,
    height=150
).encode(color='city:N')

aa = alt.Chart(aa_weather_df).mark_line().encode(
    x=alt.X('date:T', scale={'domain': brush.ref()}),
    y=alt.Y('humidity:Q', scale={'domain': (40, 100)})
).properties(
    width=600,
    height=150
).encode(color='city:N')

# Compact overview chart that carries the brush selection
# (title typos fixed: "Interractive" -> "Interactive")
lower = (msk.properties(
    height=60,
    width=600
) + aa.add_selection(brush)).properties(title="Interactive mini-map. Use the cursor to select the areas you'd like to zoom into")

# Humidity comparison ("Bealrus" -> "Belarus" fixed in displayed titles)
render(alt.vconcat(msk, aa, msk + aa, lower).properties(title='Humidity in Minsk, Belarus and Ann Arbor, MI, USA').configure_mark(opacity=0.8))

# Reuse the same layered layout for the other weather variables
msk = msk.encode(y=alt.Y('avgtempC:Q'))
aa = aa.encode(y=alt.Y('avgtempC:Q'))
plot = alt.vconcat(msk, aa, msk + aa, lower).properties(title='Temperature in Minsk and Ann Arbor')
render(plot)

msk = msk.encode(y=alt.Y('cloudcover:Q', scale={'domain': (-10, 110)}))
aa = aa.encode(y=alt.Y('cloudcover:Q', scale={'domain': (-10, 110)}))
render(alt.vconcat(msk, aa, msk + aa, lower).properties(title='Cloudcover in Minsk, Belarus and Ann Arbor, MI, USA'))

msk = msk.encode(y=alt.Y('precipMM:Q', scale={'domain': (-10, 50)})).mark_line()
aa = aa.encode(y=alt.Y('precipMM:Q', scale={'domain': (-10, 50)})).mark_line()
render(alt.vconcat(msk, aa, msk + aa, lower).properties(title='Precipitation in Minsk, Belarus and Ann Arbor, MI, USA'))
Second, I am going to collect the plants' morphological properties into a pandas dataframe to combine with the weather data.
# Plant morphology spreadsheet covering 2015-2018
file_name = 'data/plants/msk_aa_15-18.xlsx'
# Sheets 4-7 hold the relevant observations -- presumably one sheet
# per year; TODO confirm the workbook layout
sheet_nums = [4,5,6,7]
# Column names to assign to the loaded sheets
column_names = ["No",
"genetic_Acc_No",
"num",
"Cultivar",
"Serial_No",
"ID",
"Regidity",
"Thickness",
"Bush",
"Early_Bot",
"Bot",
"Mildew",
"date",
"City"]
# NOTE: read_excel here is the project-local plants helper, not pandas.read_excel
plants_aa_df = plants.read_excel(file_name, sheet_nums = sheet_nums, na_values=['NA','Na'], column_names=column_names)
# Peek at the last rows to sanity-check the load
plants_aa_df.tail()
Finally, it is time to combine the weather data with the plants data
First, let's take a look at the weather data averaged by year.
# Display the per-year climate averages for a visual sanity check
msk_weather_mean_by_year_df
aa_weather_mean_by_year_df
It looks like the year is in string format and is the only connection between the two datasets.
I am going to create a new `year` column to match the weather and plants data on.
# Derive a string `year` column from the first four characters of the date
plants_aa_df['year'] = plants_aa_df['date'].apply(lambda d: str(d)[:4])
# Spot-check a few random rows
plants_aa_df.sample(4)
Both columns must be in the same format.
# Cast both join keys to plain strings so the merge keys compare equal
plants_aa_df.year = plants_aa_df.year.astype('str')
aa_weather_mean_by_year_df.index = aa_weather_mean_by_year_df.index.astype('str')
Now we can combine the two datasets.
# Merge each plant record with that year's average weather.
# right_on='date' matches the weather frame's index level, which the
# groupby named 'date'. NOTE(review): matching an index level name in
# right_on requires pandas >= 0.23 -- confirm against the pinned version.
plants_weather_aa_df = plants_aa_df.merge(aa_weather_mean_by_year_df, left_on='year', right_on='date')
plants_weather_aa_df.head()
I'd like to find the top plants that are most resistant to Botrytis (Bot).
Specifically, I am looking for plants that have never gone above the threshold in the Bot column.
# A cultivar counts as resistant if its infection level never exceeded this cutoff
threshold = 0.4
# Worst (maximum) Bot infection level ever observed for each cultivar
max_bot_df = plants_weather_aa_df.groupby('Cultivar')['Bot'].max()
# Resistant cultivars: worst infection stayed below the cutoff
most_resistant = max_bot_df[max_bot_df < threshold]
print(most_resistant)
most_resistant_list = most_resistant.index
# Keep only the rows belonging to the resistant cultivars
most_resistant_df = plants_weather_aa_df.loc[plants_weather_aa_df['Cultivar'].isin(most_resistant_list)]
Let's take a look at what the most resistant cultivars look like.
most_resistant_df['Cultivar'].value_counts()
Removing 'Richardson Perfection' and 'Kansas', as each has only one record and might not be good data.
# Drop the two cultivars whose data is too sparse to trust
sparse_cultivars = ['Richardson Perfection', 'Kansas']
most_resistant_df = most_resistant_df[~most_resistant_df.Cultivar.isin(sparse_cultivars)]
# Confirm the remaining per-cultivar record counts
most_resistant_df['Cultivar'].value_counts()
Now, looking for plants that are least resistant (susceptible) to Botrytis (Bot).
Specifically, I am looking for plants whose Bot level has gone above the susceptibility cutoff at least once.
# Cutoff above which a cultivar counts as susceptible (the original code
# buried this 1.5 as a magic number while the comment claimed it used
# `threshold`; it is now an explicit, separately named constant)
susceptible_threshold = 1.5
# Least resistant (susceptible) cultivars: those whose worst (maximum)
# Bot infection level exceeded the cutoff at least once
least_resistant = max_bot_df[max_bot_df > susceptible_threshold]
print(least_resistant)
least_resistant_list = least_resistant.index
# Filter the full dataframe down to only the least resistant cultivars
# (comment fixed: this previously said "most resistant")
least_resistant_df = plants_weather_aa_df[ plants_weather_aa_df.Cultivar.isin(least_resistant_list) ]
def bot_interractive_scatterplots(source, title):
    """Four linked scatterplots of Botrytis level against climate variables.

    A global interval selection is shared across the panels, so brushing
    in any one of them highlights the same cultivars in all of them.
    (The typo in the function name is kept for backward compatibility
    with existing callers.)
    """
    selection = alt.selection(type='interval', resolve='global')
    # Common base: Bot on x; brushed cultivars keep their colour,
    # everything else is greyed out
    base = alt.Chart(source).mark_circle(size=125).encode(
        alt.X('Bot:Q'),
        color=alt.condition(selection, 'Cultivar', alt.ColorValue('gray'))
    ).add_selection(
        selection
    ).properties(
        width=250,
        height=250
    )
    # One panel per climate variable, each with a hand-tuned y domain
    panels = [
        base.encode(alt.Y(field, scale=alt.Scale(domain=domain)))
        for field, domain in (
            ('avgtempC', (20, 23)),
            ('humidity', (69, 77)),
            ('precipMM', (3, 5)),
            ('cloudcover', (30, 46)),
        )
    ]
    # Chain the panels side by side, exactly like a | b | c | d
    row = panels[0]
    for panel in panels[1:]:
        row = row | panel
    return row.properties(title=title)
Now using the function to plot the most resistant species.
render(bot_interractive_scatterplots(most_resistant_df,'Botrytis and Climate relationship among Resistant Species'))
Now the same, but for the least resistant species.
render(bot_interractive_scatterplots(least_resistant_df,'Botrytis and Climate relationship among Least Resistant Species'))
# Most resistant cultivars: circle size encodes Bot level, humidity is
# clamped to its informative band (69-77)
source = most_resistant_df
# NOTE(review): this brush is reused by the least-resistant figure
# below; with resolve='global' the selections appear linked across
# figures -- confirm that linkage is intended
brush = alt.selection(type='interval', resolve='global')
base = alt.Chart(source).mark_circle().encode(
alt.Y('humidity:Q',scale=alt.Scale(domain=(69, 77),clamp=True)),
size='Bot:Q',
color=alt.condition(brush, 'Cultivar', alt.ColorValue('gray'))
).add_selection(
brush
).properties(
width=250,
height=250
)
# Side by side: Bot level and bush size on the x axis
render(base.encode(x='Bot') | base.encode(x='Bush'))
# Least resistant cultivars: same layout, with average temperature on y
source = least_resistant_df
# Reuses `brush` defined for the previous figure -- NOTE(review):
# confirm the cross-figure selection linkage is intended
base = alt.Chart(source).mark_circle().encode(
alt.Y('avgtempC:Q',scale=alt.Scale(domain=(19, 23),clamp=True)),
size='Bot:Q',
color=alt.condition(brush, 'Cultivar:N', alt.ColorValue('gray'))
).add_selection(
brush
).properties(
width=250,
height=250
)
# Side by side: Bot level and bush size on the x axis
render(base.encode(x='Bot') | base.encode(x='Bush')
)
Plotting the most resistant species to see if there might be a correlation between climate and infection level.
# Pairwise scatterplots with regression lines between Bot and each
# climate variable, for the most resistant cultivars
g = sns.PairGrid(most_resistant_df, vars=['Bot','avgtempC', 'precipMM', 'cloudcover','humidity'])
g.map(sns.regplot)
Now the same process, but for the least resistant species.
# Same pairwise regression grid, for the least resistant cultivars
g = sns.PairGrid(least_resistant_df, vars=['Bot','avgtempC', 'precipMM', 'cloudcover','humidity'])
g.map(sns.regplot)
# One regression plot of Bot against each climate variable, first for the
# most resistant, then for the least resistant cultivars.
# (The original bound the figures to mr1/mr2, overwriting mr2 three times;
# the figure objects were never used afterwards, so the names are dropped
# and the duplicated calls collapsed into a loop.)
for df_temp in (most_resistant_df, least_resistant_df):
    for climate_var in ('avgtempC', 'humidity', 'precipMM', 'cloudcover'):
        sns.lmplot(x=climate_var, y='Bot', data=df_temp)
# Pearson correlation of the climate variables with Botrytis level,
# for each cultivar group
climate_and_bot = ['avgtempC', 'humidity', 'precipMM', 'cloudcover', 'Bot']
most_resistant_df[climate_and_bot].corr(method='pearson')
least_resistant_df[climate_and_bot].corr(method='pearson')