import pandas as pd
import seaborn as sb
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.ticker import FormatStrFormatter
from sklearn.model_selection import GroupShuffleSplit
pd.options.mode.chained_assignment = None
In this experiment we would like to interpolate satellite observations of glacier elevation change with the help of other variables. The goal is to generate maps of elevation change with uncertainties. These results will be used to estimate glacier contributions to sea level change and to make both conservative and extreme predictions of future sea level. Capturing the tail end of the glacier change distribution is therefore important.
The data has approximately $N \approx 5 \times 10^5$ data points and $D_{\text{in}} = 7$ input dimensions. This is on the larger side of dataset sizes tractable with an exact GP, whose inference scales as $\mathcal{O}(N^3)$ in time and $\mathcal{O}(N^2)$ in memory. An alternative could be to train a Convolutional Conditional Neural Process (see deepsensor). However, we will see that it is still possible to apply GPs in this case.
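To make the scaling concrete, here is a rough back-of-the-envelope calculation (a sketch only; exact numbers depend on dtype and solver) of the cost of the dense kernel matrix at this size:
N = 5e5                                  # number of data points
kernel_matrix_bytes = N**2 * 8           # dense N x N matrix of float64
print(f"Kernel matrix: {kernel_matrix_bytes / 1e12:.1f} TB")  # ~2.0 TB
print(f"Cholesky cost: ~{N**3:.1e} FLOPs")                    # O(N^3) time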
# Load the data
df = pd.read_csv('data/glacier_dataset_targets.csv', usecols=['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'dhdt_filt', 'x', 'y', 'date_min', 'date_max'])
df
| | elev | ocean_dist | slope | vel | aspect | x | y | dhdt_filt | date_max | date_min |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 397.11383 | 4008.621682 | 5.197093 | 35.433570 | 102.33453 | 98730.00 | -3292241.2 | -0.841598 | 737713.0 | 732235.0 |
| 1 | 402.80795 | 4355.272741 | 4.037823 | 60.513220 | 102.66707 | 98762.75 | -3291823.2 | -0.815259 | 737713.0 | 732235.0 |
| 2 | 407.51212 | 4806.639575 | 2.551501 | 57.453743 | 109.58238 | 98789.48 | -3291306.5 | -0.921331 | 737713.0 | 732235.0 |
| 3 | 414.10504 | 5224.893579 | 1.833211 | 53.611755 | 140.86260 | 98810.63 | -3290840.5 | -1.034803 | 737713.0 | 732235.0 |
| 4 | 428.24548 | 5681.412747 | 2.089108 | 49.410866 | 161.33139 | 98704.96 | -3290392.8 | -1.086033 | 737713.0 | 732235.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 497135 | 384.22247 | 1528.472804 | 5.664680 | 1.496897 | 302.40158 | 469215.40 | -765207.1 | -0.288429 | 737440.0 | 732113.0 |
| 497136 | 385.54993 | 1540.715409 | 5.596981 | 1.509064 | 301.34116 | 469229.25 | -765215.2 | -0.229011 | 737471.0 | 732113.0 |
| 497137 | 444.31805 | 1477.312500 | 3.302001 | 1.991713 | 211.50058 | 470123.72 | -765149.8 | -0.156538 | 737683.0 | 731900.0 |
| 497138 | 442.05338 | 1803.294730 | 2.370549 | 3.348576 | 124.18109 | 470654.30 | -765440.3 | -0.228664 | 737683.0 | 731900.0 |
| 497139 | 349.88220 | 1318.378132 | 8.018579 | 1.110003 | 321.30118 | 469027.75 | -765012.6 | -0.408836 | 737440.0 | 732266.0 |
497140 rows × 10 columns
In this step, we relate domain knowledge (e.g. the glacier literature) to the problem at hand. Elevation change can be positive or negative. We also know that the data are the result of a linear regression applied to raw data collected over a period of several days by the ICESat and ICESat-2 satellites. A possible source of noise is differences in observation times as the satellites orbit over the study area. As the data are geophysical, points closer together should be more similar. The variables are defined as follows:
| name | description |
|---|---|
| dhdt_filt | 'filtered' elevation change [m/yr] |
| elev | topographic elevation from altimetry [m] |
| ocean_dist | distance to the ocean [m] |
| slope | topographic surface slope [m per m] |
| vel | surface velocity [m/yr] |
| aspect | topographic aspect [degrees from north] |
| x | cartesian x coordinate in EPSG 3413 projection [m] |
| y | cartesian y coordinate in EPSG 3413 projection [m] |
| date_min, date_max | bounds of the observation period, used below to identify satellite tracks |
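To connect the observation-time discussion above to the data, we can inspect the window spanned by date_min and date_max (assuming these are day-number timestamps, so their difference is a span in days):
# Length of the observation window per point (assuming date_min/date_max
# are day-number timestamps, so the difference is a span in days)
print((df['date_max'] - df['date_min']).describe())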
Since we are looking at filling in gaps over the spatial coordinates, the first thing to do is to plot the data in the same format as the output we would like to produce. The area with the least data appears to be the south of Greenland. The validation and test sets should be similar in distribution to the areas we actually want to predict. Based on this, we suggest splitting the data along the satellite tracks: data along a track are more correlated with each other, as they are observed at similar times and processed together.
# Simple scatter plot of target variable
df.plot.scatter(x='x', y='y', c='dhdt_filt', figsize=(10,10), cmap='coolwarm', norm=colors.CenteredNorm(),)
plt.ticklabel_format(axis='both', style='sci', scilimits=(4,4))
# Split data into training, validation and test sets
df['group'] = df.groupby(['date_min', 'date_max']).ngroup()  # one group ID per satellite track (rows sharing date_min/date_max)
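A quick way to sanity-check the grouping (an optional inspection step) is to look at how many tracks we have and how many points fall on each:
# Sanity check: number of tracks and points per track
track_sizes = df.groupby('group').size()
print(f"{track_sizes.size} tracks, median {track_sizes.median():.0f} points per track")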
## Split into training and evaluation set (70%/30%)
gss1 = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=42)
# n_splits=1, so take the single (train, eval) index pair from the generator
train_index, eval_index = next(gss1.split(
    X=df[['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'x', 'y']].values,
    y=df['dhdt_filt'].values, groups=df.group.values))
print(f" Train: index={train_index}, group={df.group.values[train_index]}")
print(f" Eval: index={eval_index}, group={df.group.values[eval_index]}")
train_df = df.iloc[train_index]
## Split evaluation set into validation and test sets (33%/67%, i.e. ~10%/20% of all data)
eval_df = df.iloc[eval_index]
gss2 = GroupShuffleSplit(n_splits=1, train_size=.3333, random_state=42)
val_index, test_index = next(gss2.split(
    X=eval_df[['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'x', 'y']].values,
    y=eval_df['dhdt_filt'].values, groups=eval_df.group.values))
print(f" Validation: index={val_index}, group={eval_df.group.values[val_index]}")
print(f" Test: index={test_index}, group={eval_df.group.values[test_index]}")
val_df = eval_df.iloc[val_index]
test_df = eval_df.iloc[test_index]
Train: index=[0 1 2 ... 497136 497138 497139], group=[0 1 2 ... 875 18721 27]
Eval: index=[4 6 7 ... 497128 497134 497137], group=[4 6 7 ... 1741 18719 18720]
Validation: index=[1 5 8 ... 150135 150136 150142], group=[6 6 6 ... 20379 10226 18719]
Test: index=[0 2 3 ... 150140 150141 150143], group=[4 7 4 ... 20385 1741 18720]
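The whole point of the grouped split is that no track is shared between sets, so a short assertion (a suggested sanity check, not part of the modelling pipeline) can confirm the groups are disjoint:
# Confirm that no satellite track appears in more than one set
train_groups = set(train_df.group)
val_groups = set(val_df.group)
test_groups = set(test_df.group)
assert train_groups.isdisjoint(val_groups | test_groups)
assert val_groups.isdisjoint(test_groups)
print("No group leakage between train/val/test")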
We can now check that the sets capture the distribution of the data across the different input features. We plot scatter plots and pair plots to visually ascertain this.
fig, ax = plt.subplots(1, 3, figsize=(20, 7))
ax[0].set_title('Training set')
sp0 = train_df.plot.scatter(x='x', y='y', color='grey', alpha=0.2, ax=ax[0])
ax[0].ticklabel_format(axis='both', style='sci', scilimits=(4,4))
ax[1].set_title('Validation set')
ax[1].ticklabel_format(axis='both', style='sci', scilimits=(4,4))
sp1 = val_df.plot.scatter(x='x', y='y', color='lightblue', alpha=0.2, ax=ax[1])
ax[2].set_title('Test set')
ax[2].ticklabel_format(axis='both', style='sci', scilimits=(4,4))
sp2 = test_df.plot.scatter(x='x', y='y', color='gold', alpha=0.2, ax=ax[2])
# Pairplot of all variables
train_df.loc[:, 'set'] = 'train'
val_df.loc[:, 'set'] = 'val'
test_df.loc[:, 'set'] = 'test'
all_set_df = pd.concat([train_df, val_df, test_df])
grid = sb.pairplot(all_set_df[::1000].drop(columns=['group']), hue='set')  # subsample every 1000th row for speed
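Beyond the visual check, one could compare the per-feature distributions quantitatively, for example with a two-sample Kolmogorov-Smirnov test between the training and test sets (a rough sketch; a small statistic indicates similar distributions):
from scipy.stats import ks_2samp

# Two-sample KS statistic per feature between training and test sets
for col in ['elev', 'ocean_dist', 'slope', 'vel', 'aspect', 'dhdt_filt']:
    stat, p = ks_2samp(train_df[col], test_df[col])
    print(f"{col}: KS statistic = {stat:.3f}")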
# Save the data
# train_df.to_csv('data/training_set.csv')
# val_df.to_csv('data/validation_set.csv')
# test_df.to_csv('data/test_set.csv')