Making it easier to navigate and clean station data
Project description
Documentation
https://filterstations.netlify.app/
Water Level Pipeline
- A series of functions to be added to the filter-stations module on PyPI to evaluate which TAHMO stations to use that corroborate with the water level
- All begins with the coordinates of the gauging station(location of the monitoring sensor)
import os
from pathlib import Path
import haversine as hs
import pandas as pd
import numpy as np
import datetime
import statsmodels.api as sm
from matplotlib.dates import DateFormatter
import matplotlib.pyplot as plt
# config_path
config_path = os.path.join(Path(os.getcwd()).parent.absolute(), 'config.json')
from filter_stations import retreive_data, Interactive_maps, Filter
import json
# Authentication: read the TAHMO API credentials from config.json (one
# directory above the working directory) and build the API client `fs`
# used by every function in this pipeline.
with open(config_path) as f:
    conf = json.load(f)
apiKey = conf['apiKey']
apiSecret = conf['apiSecret']
# NOTE(review): 'retreive_data' is the (misspelled) class name exported by
# the filter_stations package, not a typo introduced here.
fs = retreive_data(apiKey, apiSecret)
# given the radius and the longitude and latitude of the gauging station, return the stations within
def stations_within_radius(radius, latitude, longitude, df=False):
    """Return the TAHMO stations within `radius` km of a gauging station.

    Parameters
    ----------
    radius : numeric
        Search radius in kilometres (haversine/great-circle distance).
    latitude, longitude : float
        Coordinates of the gauging station (monitoring sensor location).
    df : bool, default False
        If True, return a DataFrame with code, coordinates and distance;
        otherwise return only the station codes as a numpy array.
    """
    origin = (latitude, longitude)
    stations = fs.get_stations_info()
    # Great-circle distance from the gauging station to every TAHMO station.
    stations['distance'] = stations.apply(
        lambda row: hs.haversine(origin, (row['location.latitude'], row['location.longitude'])),
        axis=1,
    )
    keep_cols = ['code', 'location.latitude', 'location.longitude', 'distance']
    nearby = stations[keep_cols].sort_values('distance')
    nearby = nearby[nearby['distance'] <= radius]
    return nearby if df else nearby.code.values
# Station codes within a 100 km radius of the Ewaso gauging station
# (lat -0.406689, lon 36.96301).
ewaso = stations_within_radius(100, -0.406689, 36.96301)
ewaso
API request: services/assets/v2/stations
array(['TA00283', 'TA00378', 'TA00754', 'TA00074', 'TA00196', 'TA00073',
'TA00056', 'TA00029', 'TA00416', 'TA00719', 'TA00258', 'TA00622',
'TA00028', 'TA00414', 'TA00190', 'TA00078', 'TA00024', 'TA00080',
'TA00166', 'TA00108', 'TA00026', 'TA00189', 'TA00250', 'TA00182',
'TA00715', 'TA00377', 'TA00027', 'TA00057', 'TA00134', 'TA00448',
'TA00774', 'TA00773', 'TA00772', 'TA00775', 'TA00771', 'TA00679',
'TA00770'], dtype=object)
The assumption here is one can have credential but not the data
- From the list of stations get the precipitation data with a certain data completeness check provided
- Additionally the start and end date if the data is not provided
- The default start date is the day the sensors were set up at DSAIL
- Check the documentation on the types of variables available
def stations_data_check(stations_list, percentage=1, start_date=None, end_date=None, data=None, variables=None, csv_file=None):
    """Fetch measurements for the given stations and keep only the station
    columns that meet a data-completeness threshold.

    Parameters
    ----------
    stations_list : list of str
        TAHMO station codes to retrieve.
    percentage : float, default 1
        Required fraction of non-missing values per column (1 == complete).
    start_date, end_date : str, optional
        Date range passed to the API when `data` is not supplied.
    data : pandas.DataFrame, optional
        Pre-fetched measurements; when given, no API call is made.
    variables : list of str, optional
        Variable shortcodes to fetch; defaults to ['pr'] (precipitation).
    csv_file : str, optional
        When given, the filtered data is also written to this CSV file;
        a '.csv' suffix is appended only if not already present.

    Returns
    -------
    pandas.DataFrame
        The data restricted to sufficiently complete station columns.
    """
    # Mutable default argument fixed: ['pr'] was shared across calls.
    if variables is None:
        variables = ['pr']
    if data is None:
        data = fs.multiple_measurements(stations_list, startDate=start_date, endDate=end_date, variables=variables, csv_file=csv_file)
    # Check the percentage of missing data and return the stations with less than the percentage of missing data
    data.index = data.index.astype('datetime64[ns]')
    data = data.dropna(axis=1, thresh=int(len(data) * percentage))
    # BUG FIX: the original always wrote f'{csv_file}.csv', which produced
    # 'ewaso2.csv.csv' for csv_file='ewaso2.csv' and a stray 'None.csv'
    # when no file was requested at all.
    if csv_file:
        out_path = csv_file if csv_file.endswith('.csv') else f'{csv_file}.csv'
        data.to_csv(out_path)
    return data
stations_df = stations_data_check(list(ewaso), start_date='2022-12-01', end_date='2022-12-31', variables=['pr'], csv_file='ewaso2.csv')
Apart from the completeness another method of validation by eliminating unusable sensors is checking for a positive correlation and lag
- The default lag is 3 days between a particular station and the gauging station
- The required format is a timeseries data
- Provide the column names for evaluation format = [Date, data]
- with the change in parameters one can choose above or below threshold
def stations_lag(weather_stations_df, gauging_stations_df, gauging_station_columns, date=None, lag=3, above=False, below=False):
    """Cross-correlate each weather station's series against the gauging
    station's water-level series and split stations by the lag at which the
    correlation peaks.

    Parameters
    ----------
    weather_stations_df : pandas.DataFrame
        Station measurements indexed by date, one column per station.
    gauging_stations_df : pandas.DataFrame
        Gauging-station timeseries; required format [Date, data].
    gauging_station_columns : list of str
        [date column name, data column name] in `gauging_stations_df`.
    date : str, optional
        Start date ('%d/%m/%Y'); defaults to the first value of the date
        column in the gauging data.
    lag : int, default 3
        Lag threshold (days) separating the two result groups.
    above, below : bool
        Select which group(s) to return.

    Returns
    -------
    dict or (dict, dict)
        Per-station dicts with the argmax lag, the peak coefficient, the
        full coefficient list, and the two input series.  When both flags
        (or neither) are set, returns (above_thresh_lag, below_thresh_lag).
    """
    # Align the weather data to the gauging station's date range:
    # one water-level reading per day starting at `date`.
    if date is None:
        date = gauging_stations_df.loc[0, gauging_station_columns[0]]
    start_date = datetime.datetime.strptime(date, "%d/%m/%Y")
    end_date = start_date + datetime.timedelta(len(gauging_stations_df) - 1)
    df_fit = weather_stations_df[start_date:end_date]
    # get the water data list
    water_list = list(gauging_stations_df[f'{gauging_station_columns[1]}'])
    above_thresh_lag = dict()
    below_thresh_lag = dict()
    # Cross-correlate every station column against the water-level series.
    for cols in df_fit.columns:
        select_list = list(df_fit[cols])
        coefficient_list = list(sm.tsa.stattools.ccf(select_list, water_list, adjusted=False))
        a = np.argmax(coefficient_list)   # lag with the highest correlation
        b = coefficient_list[a]           # the correlation at that lag
        result = {
            'lag': a,
            'coefficient': b,
            'coefficient_list': coefficient_list,
            'select_list': select_list,
            'water_list': water_list
        }
        if a > lag:
            above_thresh_lag[cols] = result
        else:
            below_thresh_lag[cols] = result
    if above and below:
        return above_thresh_lag, below_thresh_lag
    elif above:
        return above_thresh_lag
    elif below:
        return below_thresh_lag
    # BUG FIX: with the default flags (above=False, below=False) the
    # original fell through and silently returned None, discarding all the
    # work; default to returning both groups instead.
    return above_thresh_lag, below_thresh_lag
# Daily water-level readings from the 1E2020 gauging station.
water_six = pd.read_csv('./water-level-data-ewaso/1E2020.csv')
water_six
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
time | water_level(m) | |
---|---|---|
0 | 12/05/2020 | 2.618646 |
1 | 13/05/2020 | 2.551392 |
2 | 14/05/2020 | 2.507711 |
3 | 15/05/2020 | 2.491130 |
4 | 16/05/2020 | 2.434761 |
... | ... | ... |
259 | 26/01/2021 | 0.947099 |
260 | 27/01/2021 | 0.929186 |
261 | 28/01/2021 | 0.911274 |
262 | 29/01/2021 | 0.910711 |
263 | 30/01/2021 | 0.939971 |
264 rows × 2 columns
# Stations whose peak cross-correlation lag is at or below 3 days.
lag_ = stations_lag(stations_df, water_six, ['time', 'water_level(m)'], lag=3,below=True)
lag_
Plotting
Provides visuals of the data
- An option to save the
- An option of choosing the dpi
- provide the startDate based on the water collection starting date
lag_[list(lag_.keys())[0]]['water_list']
import warnings
warnings. filterwarnings('ignore')
def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
    """Plot rainfall (bars, left axis) and water level/stage (line, right
    axis) against time, one figure per selected station.

    Parameters
    ----------
    weather_stations : pandas.DataFrame
        Station rainfall data indexed by date.
    water_list : list
        Daily water-level readings; its length fixes the plotted date range.
    threshold_list : list of str
        Station columns (e.g. keys returned by stations_lag) to plot.
    save : bool, default False
        When True, save each figure as '<station>.png'.
    dpi : int, default 500
        Resolution used when saving.
    date : str, default '11-02-2021'
        Start date ('%d-%m-%Y') of the water-collection period.
    """
    start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
    # One water-level value per day, so the range spans len(water_list) days.
    end_date = start_date + datetime.timedelta(len(water_list)-1)
    # weather_stations = weather_stations.set_index('Date')
    df_plot = weather_stations[start_date:end_date]
    df_plot = df_plot[threshold_list].reset_index()
    df_plot.rename(columns={'index':'Date'}, inplace=True)
    plt.rcParams['figure.figsize'] = (15, 9)
    print('Begin plotting!')
    # One figure per station column (column 0 is 'Date').
    for cols in df_plot.columns[1:]:
        fig, ax1 = plt.subplots()
        color = 'tab:blue'
        ax1.set_xlabel(f'Time', fontsize=24, weight='bold')
        ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
        ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[f'{cols}'], color=color, width=4, alpha=1.0)
        ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
        ax1.tick_params(axis='x')
        # NOTE(review): set_xticklabels without a matching set_xticks call
        # normally triggers a matplotlib warning — presumably the reason
        # warnings are suppressed before this function is called.
        ax1.set_xticklabels(df_plot['Date'], fontsize=21, weight='bold')
        ax1.grid(color='gray', linestyle='--', linewidth=0.8)
        ax1.set(facecolor="white")
        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
        color = 'tab:red'
        ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
        ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4)
        ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
        ax2.set(facecolor="white")
        plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')
        # Format the shared x-axis as month-year.
        date_form = DateFormatter("%m-%y")
        ax1.xaxis.set_major_formatter(date_form)
        fig.tight_layout()
        if save:
            fig.savefig(f'{cols}.png', dpi=dpi)
plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
Begin plotting!
Format to get the stations' metadata
def filter_metadata(lag_keys):
    """Look up TAHMO station metadata for the stations named in `lag_keys`.

    Each key is split on '_' and the leading segment is treated as the
    station code (presumably keys look like '<code>_<variable>' — verify
    against the columns produced upstream).
    """
    station_codes = [key.split('_')[0] for key in list(lag_keys)]
    return fs.get_stations_info(multipleStations=station_codes)
filter_metadata(list(lag_.keys()))
API request: services/assets/v2/stations
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
code | status | installationdate | elevationground | sensorinstallations | dataloggerinstallations | creatorid | created | updaterid | updated | ... | location.countrycode | location.zipcode | location.latitude | location.longitude | location.elevationmsl | location.note | location.creatorid | location.created | location.updaterid | location.updated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
26 | TA00028 | 1 | 2015-08-31T00:00:00Z | 9.0 | None | None | 2 | 2018-12-11T08:35:17.888233Z | 2 | 2018-12-11T08:35:17.888233Z | ... | KE | 0.055219 | 37.136747 | 2003.6 | {} | 2 | 2018-10-26T13:32:16.15537Z | 37 | 2022-06-30T11:11:50.27135Z | |
27 | TA00029 | 1 | 2015-09-02T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T08:36:19.30342Z | 2 | 2018-12-11T08:36:19.30342Z | ... | KE | -0.500776 | 36.587511 | 2545.8 | {} | 2 | 2018-10-26T13:33:31.451613Z | 37 | 2022-02-28T12:25:09.578242Z | |
53 | TA00057 | 1 | 2015-10-08T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:21:29.092833Z | 2 | 2018-12-11T09:21:29.092833Z | ... | KE | -1.253030 | 36.856487 | 1645.3 | {} | 2 | 2018-10-29T09:13:33.768613Z | 2 | 2022-07-26T07:34:06.603938Z | |
68 | TA00074 | 1 | 2015-11-19T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:38:25.742397Z | 2 | 2018-12-11T09:38:25.742397Z | ... | KE | -0.566080 | 37.074412 | 1726.8 | {} | 2 | 2018-10-29T10:35:28.49617Z | 2 | 2022-07-26T07:38:42.100985Z | |
74 | TA00080 | 1 | 2016-01-28T00:00:00Z | 2.0 | None | None | 2 | 2018-12-11T09:43:10.523398Z | 2 | 2018-12-11T09:43:10.523398Z | ... | KE | -1.087589 | 36.818402 | 1777.3 | {} | 2 | 2018-10-29T10:53:47.845042Z | 37 | 2022-02-28T13:07:04.709903Z | |
150 | TA00166 | 1 | 2017-05-11T00:00:00Z | 2.0 | None | None | 2 | 2018-12-12T08:29:28.10697Z | 2 | 2018-12-12T08:29:28.10697Z | ... | KE | -0.319508 | 37.659139 | 1404.0 | {} | 2 | 2018-11-10T08:47:37.949135Z | 2 | 2018-11-10T08:47:37.949135Z |
6 rows × 28 columns
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
filter_stations-0.3.13.tar.gz
(22.2 kB
view hashes)
Built Distribution
Close
Hashes for filter_stations-0.3.13-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 23c7351c7dd9a10b7f8ff2d274715ab263af78c85434a097e79bd84ec6ad4784 |
|
MD5 | 9f5eedfc4bfcdbe2a51743469d258bd0 |
|
BLAKE2b-256 | ed24ad02a6eb3f33aab818a7ff8babc8587c0aa3fd5ff961a27b478ff3306780 |