Glacier elevation change¶

Part I : Data Exploration (Steps 1 to 4)¶

In [1]:
import pandas as pd
import seaborn as sb
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.ticker import FormatStrFormatter
from sklearn.model_selection import GroupShuffleSplit
In [2]:
pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning when labelling the train/val/test sets later

Step 1: Problem definition¶

In this experiment we would like to interpolate satellite observations of glacier elevation change with the help of other variables. The goal is to generate maps of elevation change with uncertainties. These results will be used to estimate glacier contributions to sea-level change and to make both conservative and extreme predictions of future sea level. Capturing the tail end of the glacier change distribution is therefore important.

Step 2: Initial data exploration¶

The data have approximately $\text{N} = 5 \times 10^5$ data points and $\text{D}_{\text{in}} = 7$ input dimensions. This is on the larger side of what is tractable with an exact GP. An alternative could be to train a Convolutional Conditional Neural Process (see deepsensor). However, we will see that it is still possible to apply GPs in this case.
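
As a rough back-of-the-envelope check of what "larger side" means here, we can estimate the memory and compute cost of a dense exact GP at this $\text{N}$ (order-of-magnitude figures only):

In [ ]:
# Rough cost of an exact GP: O(N^2) memory for the dense kernel matrix
# and O(N^3) flops for its Cholesky factorisation
N = 5e5
print(f"Kernel matrix memory: ~{N**2 * 8 / 1e9:.0f} GB (float64)")
print(f"Cholesky factorisation: ~{N**3 / 3:.1e} flops")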

In [3]:
# Load the data
df = pd.read_csv('data/glacier_dataset_targets.csv', usecols=['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'dhdt_filt', 'x', 'y', 'date_min', 'date_max'])
In [4]:
df
Out[4]:
elev ocean_dist slope vel aspect x y dhdt_filt date_max date_min
0 397.11383 4008.621682 5.197093 35.433570 102.33453 98730.00 -3292241.2 -0.841598 737713.0 732235.0
1 402.80795 4355.272741 4.037823 60.513220 102.66707 98762.75 -3291823.2 -0.815259 737713.0 732235.0
2 407.51212 4806.639575 2.551501 57.453743 109.58238 98789.48 -3291306.5 -0.921331 737713.0 732235.0
3 414.10504 5224.893579 1.833211 53.611755 140.86260 98810.63 -3290840.5 -1.034803 737713.0 732235.0
4 428.24548 5681.412747 2.089108 49.410866 161.33139 98704.96 -3290392.8 -1.086033 737713.0 732235.0
... ... ... ... ... ... ... ... ... ... ...
497135 384.22247 1528.472804 5.664680 1.496897 302.40158 469215.40 -765207.1 -0.288429 737440.0 732113.0
497136 385.54993 1540.715409 5.596981 1.509064 301.34116 469229.25 -765215.2 -0.229011 737471.0 732113.0
497137 444.31805 1477.312500 3.302001 1.991713 211.50058 470123.72 -765149.8 -0.156538 737683.0 731900.0
497138 442.05338 1803.294730 2.370549 3.348576 124.18109 470654.30 -765440.3 -0.228664 737683.0 731900.0
497139 349.88220 1318.378132 8.018579 1.110003 321.30118 469027.75 -765012.6 -0.408836 737440.0 732266.0

497140 rows × 10 columns
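
Before moving on, a quick check for missing values and the overall range of each column helps sanity-check the load (a standard exploration step on the dataframe above):

In [ ]:
# Basic sanity checks on the loaded data
print(df.isna().sum())
df.describe()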

Step 3: Domain expertise¶

In this step, we relate domain knowledge (e.g. the glacier literature) to the problem at hand. Elevation change can be positive or negative. We also know that the data are the result of a linear regression applied to raw data collected over a period of several days by the ICESat and ICESat-2 satellites. A possible source of noise is differences in the observation times as the satellites orbit over the study area; we take a quick look at this below, after the variable table. As the data are geophysical, points closer together should be more similar. The variables are defined as follows:

name        description
dhdt_filt   'filtered' elevation change [m/yr] (the target variable)
elev        topographic elevation from altimetry [m]
ocean_dist  distance to ocean [m]
slope       topographic surface slope [m per m]
vel         surface velocity [m/yr]
aspect      topographic aspect [degrees from north]
x           cartesian x coordinate in the EPSG:3413 projection [m]
y           cartesian y coordinate in the EPSG:3413 projection [m]
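
To get a feel for the observation-time differences mentioned above, we can look at the span between date_min and date_max for each point (this assumes the two columns are serial day numbers, which the dataset documentation would need to confirm):

In [ ]:
# Span of observation dates per point, assuming date_min/date_max are serial day numbers
(df['date_max'] - df['date_min']).describe()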

Step 4: Training, validation and test sets¶

Since we are looking at filling in gaps over spatial coordinates, the first thing to do is to plot the data in the same format in which we would like to output it. The area with the least data appears to be the south of Greenland. The validation and test sets should be similar in distribution to the areas we actually want to predict. Based on this, we suggest splitting the data along the satellite tracks: data along the same track should be more correlated, as they are observed at similar times and processed together.

In [5]:
# Simple scatter plot of target variable
df.plot.scatter(x='x', y='y', c='dhdt_filt', figsize=(10,10), cmap='coolwarm', norm=colors.CenteredNorm(),)
plt.ticklabel_format(axis='both', style='sci',  scilimits=(4,4))
In [6]:
# Split data into training, validation and test sets

df['group'] = df.groupby(['date_min', 'date_max']).cumcount()  # group along tracks

## Split into training and evaluation set (70%/30%)
gss1 = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=42)
indices1 = gss1.split(X=df[['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'x', 'y']
                           ].values, y=df['dhdt_filt'].values, groups=df.group.values)
for i in indices1:
    (train_index, eval_index) = i
    print(
        f"  Train: index={train_index}, group={df.group.values[train_index]}")
    print(f"  Test:  index={eval_index}, group={df.group.values[eval_index]}")
train_df = df.iloc[train_index]

## Split into validation and test set (33%/67%)
eval_df = df.iloc[eval_index]
gss2 = GroupShuffleSplit(n_splits=1, train_size=.3333, random_state=42)
indices2 = gss2.split(X=eval_df[['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'dhdt_filt', 'x', 'y']].values,
                      y=eval_df['dhdt_filt'].values, groups=eval_df.group.values)
for i in indices2:
    (val_index, test_index) = i
    print(
        f"  Train: index={val_index}, group={eval_df.group.values[val_index]}")
    print(
        f"  Test:  index={test_index}, group={eval_df.group.values[test_index]}")
val_df = eval_df.iloc[val_index]
test_df = eval_df.iloc[test_index]
  Train: index=[     0      1      2 ... 497136 497138 497139], group=[    0     1     2 ...   875 18721    27]
  Test:  index=[     4      6      7 ... 497128 497134 497137], group=[    4     6     7 ...  1741 18719 18720]
  Train: index=[     1      5      8 ... 150135 150136 150142], group=[    6     6     6 ... 20379 10226 18719]
  Test:  index=[     0      2      3 ... 150140 150141 150143], group=[    4     7     4 ... 20385  1741 18720]
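
A quick size check confirms the split proportions, roughly 70% / 10% / 20% of the rows (the exact fractions depend on the group sizes, since GroupShuffleSplit splits on groups rather than individual rows):

In [ ]:
# Relative size of each set
for name, subset in [('train', train_df), ('val', val_df), ('test', test_df)]:
    print(f"{name}: {len(subset)} rows ({len(subset) / len(df):.1%})")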

We can now check that the sets capture the distribution of the data across the different input features. We plot scatter plots and pair plots to visually ascertain this.

In [7]:
fig, ax = plt.subplots(1, 3, figsize=(20, 7))

ax[0].set_title('Training set')
sp0 = train_df.plot.scatter(x='x', y='y', color='grey', alpha=0.2, ax=ax[0])
ax[0].ticklabel_format(axis='both', style='sci',  scilimits=(4,4))

ax[1].set_title('Validation set')
ax[1].ticklabel_format(axis='both', style='sci',  scilimits=(4,4))
sp1 = val_df.plot.scatter(x='x', y='y', color='lightblue',  alpha=0.2, ax=ax[1])

ax[2].set_title('Test set')
ax[2].ticklabel_format(axis='both', style='sci',  scilimits=(4,4))
sp2 = test_df.plot.scatter(x='x', y='y', color='gold', alpha=0.2, ax=ax[2])
In [8]:
# Pairplot of all variables

train_df.loc[:, 'set'] = 'train'
val_df.loc[:, 'set'] = 'val'
test_df.loc[:, 'set'] = 'test'
all_set_df = pd.concat([train_df, val_df, test_df])

# Subsample every 1000th row so the pair plot renders quickly
grid = sb.pairplot(all_set_df[::1000].drop(columns=['group']), hue='set')
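
Beyond the visual comparison, a grouped summary of the target variable gives a simple numerical check that the three sets cover similar ranges (a complement to the pair plot, not a formal test):

In [ ]:
# Summary statistics of the target variable in each set
all_set_df.groupby('set')['dhdt_filt'].describe()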
In [9]:
# Save the data
# train_df.to_csv('data/training_set.csv')
# val_df.to_csv('data/validation_set.csv')
# test_df.to_csv('data/test_set.csv')