Skip to main content

Making it easier to navigate and clean station data

Project description

Documentation

https://filterstations.netlify.app/

Water Level Pipeline

  • A series of functions to be added to the filter-stations module in PyPI to evaluate which TAHMO stations corroborate with the water level
  • All begins with the coordinates of the gauging station(location of the monitoring sensor)
import os
from pathlib import Path
import haversine as hs
import pandas as pd
import numpy as np
import datetime
import statsmodels.api as sm
from matplotlib.dates import DateFormatter
import matplotlib.pyplot as plt

# Path to the API credentials file, expected one directory above the
# current working directory.
config_path = os.path.join(Path(os.getcwd()).parent.absolute(), 'config.json')
from filter_stations import retreive_data, Interactive_maps, Filter
import json
# Authentication: read the TAHMO API key/secret from config.json and build
# the data-access client used by every function below.
with open(config_path) as f:
    conf = json.load(f)

apiKey = conf['apiKey']
apiSecret = conf['apiSecret']
fs = retreive_data(apiKey, apiSecret)
# given the radius and the longitude and latitude of the gauging station, return the stations within
def stations_within_radius(radius, latitude, longitude, df=False):
    """Return the TAHMO stations within ``radius`` km of a point.

    Parameters
    ----------
    radius : float
        Search radius in kilometres (haversine distance).
    latitude, longitude : float
        Coordinates of the gauging station.
    df : bool, optional
        When True return a DataFrame (code, coordinates, distance);
        otherwise return only the array of station codes (default).
    """
    all_stations = fs.get_stations_info()
    target = (latitude, longitude)
    # Great-circle distance from the target point to every station.
    all_stations['distance'] = all_stations.apply(
        lambda row: hs.haversine(target, (row['location.latitude'], row['location.longitude'])),
        axis=1,
    )
    nearby = all_stations[['code', 'location.latitude', 'location.longitude', 'distance']]
    nearby = nearby.sort_values('distance')
    nearby = nearby[nearby['distance'] <= radius]
    if df:
        return nearby
    return nearby.code.values
# Stations within a 100 km radius of the Ewaso gauging site
# (lat -0.406689, lon 36.96301); returns an array of station codes.
ewaso = stations_within_radius(100, -0.406689, 36.96301)
ewaso
API request: services/assets/v2/stations





array(['TA00283', 'TA00378', 'TA00754', 'TA00074', 'TA00196', 'TA00073',
       'TA00056', 'TA00029', 'TA00416', 'TA00719', 'TA00258', 'TA00622',
       'TA00028', 'TA00414', 'TA00190', 'TA00078', 'TA00024', 'TA00080',
       'TA00166', 'TA00108', 'TA00026', 'TA00189', 'TA00250', 'TA00182',
       'TA00715', 'TA00377', 'TA00027', 'TA00057', 'TA00134', 'TA00448',
       'TA00774', 'TA00773', 'TA00772', 'TA00775', 'TA00771', 'TA00679',
       'TA00770'], dtype=object)

The assumption here is that one can have credentials but not the data

  • From the list of stations get the precipitation data with a certain data completeness check provided
  • Additionally the start and end date if the data is not provided
  • The default start date is the day the sensors were set up at DSAIL
  • Check the documentation on the types of variables available
def stations_data_check(stations_list, percentage=1, start_date=None, end_date=None, data=None, variables=None, csv_file=None):
    """Fetch station measurements and keep only sufficiently complete columns.

    Parameters
    ----------
    stations_list : list
        TAHMO station codes to download (ignored when ``data`` is given).
    percentage : float, optional
        Minimum fraction of non-missing rows a column needs to be kept
        (default 1, i.e. fully complete).
    start_date, end_date : str, optional
        Date range passed to the API when downloading.
    data : pandas.DataFrame, optional
        Pre-downloaded measurements; skips the API call when provided.
    variables : list, optional
        Variable codes to request (defaults to ['pr'], precipitation).
    csv_file : str, optional
        When given, the filtered data is also written to '<csv_file>.csv'.

    Returns
    -------
    pandas.DataFrame
        The data restricted to columns meeting the completeness threshold.
    """
    # Avoid a mutable default argument; ['pr'] is the effective default.
    if variables is None:
        variables = ['pr']
    if data is None:
        data = fs.multiple_measurements(stations_list, startDate=start_date, endDate=end_date, variables=variables, csv_file=csv_file)

    # Check the percentage of missing data and return the stations with less
    # than the percentage of missing data.
    data.index = data.index.astype('datetime64[ns]')
    data = data.dropna(axis=1, thresh=int(len(data) * percentage))
    # Fix: only write when a filename was supplied (previously wrote 'None.csv'),
    # and do not double the extension when the caller already includes '.csv'.
    if csv_file is not None:
        stem = csv_file[:-4] if csv_file.endswith('.csv') else csv_file
        data.to_csv(f'{stem}.csv')
    return data
stations_df = stations_data_check(list(ewaso), start_date='2022-12-01', end_date='2022-12-31', variables=['pr'], csv_file='ewaso2.csv')

Apart from completeness, another validation method for eliminating unusable sensors is checking for a positive correlation and lag

  • The default lag is 3 days between a particular station and the gauging station
  • The required format is a timeseries data
  • Provide the column names for evaluation format = [Date, data]
  • with the change in parameters one can choose above or below threshold
def stations_lag(weather_stations_df, gauging_stations_df, gauging_station_columns, date=None, lag=3, above=False, below=False):
    """Cross-correlate each weather station with the gauging-station series.

    For every station column, the cross-correlation function (ccf) against
    the water-level data is computed and the lag of the maximum coefficient
    is compared to the ``lag`` threshold.

    Parameters
    ----------
    weather_stations_df : pandas.DataFrame
        Station measurements indexed by datetime (one column per station).
    gauging_stations_df : pandas.DataFrame
        Gauging-station timeseries in the format [Date, data].
    gauging_station_columns : list
        Two column names, ``[date_column, data_column]``; dates 'dd/mm/YYYY'.
    date : str, optional
        Start date ('dd/mm/YYYY'); defaults to the first gauging record.
    lag : int, optional
        Lag threshold, in days, splitting the stations (default 3).
    above, below : bool, optional
        Which result dict(s) to return.

    Returns
    -------
    dict or (dict, dict)
        Per-station results keyed by column name; each entry holds the best
        lag, its coefficient, the full coefficient list and both series.
        Both dicts are returned when both flags (or neither) are set.
    """
    # Align the weather data with the gauging record's date span: one row
    # per day, so the span is (number of records - 1) days long.
    if date is None:
        date = gauging_stations_df.loc[0, gauging_station_columns[0]]
    start_date = datetime.datetime.strptime(date, "%d/%m/%Y")
    end_date = start_date + datetime.timedelta(len(gauging_stations_df)-1)
    df_fit = weather_stations_df[start_date:end_date]
    # Water-level values as a plain list for the ccf call.
    water_list = list(gauging_stations_df[f'{gauging_station_columns[1]}'])
    above_thresh_lag = dict()
    below_thresh_lag = dict()
    # For every station column, find the lag with the highest
    # cross-correlation coefficient against the water-level series.
    for cols in df_fit.columns:
        select_list = list(df_fit[cols])
        coefficient_list = list(sm.tsa.stattools.ccf(select_list, water_list, adjusted=False))
        best_lag = np.argmax(coefficient_list)
        result = {
            'lag': best_lag,
            'coefficient': coefficient_list[best_lag],
            'coefficient_list': coefficient_list,
            'select_list': select_list,
            'water_list': water_list
        }
        if best_lag > lag:
            above_thresh_lag[cols] = result
        else:
            below_thresh_lag[cols] = result
    if above and below:
        return above_thresh_lag, below_thresh_lag
    elif above:
        return above_thresh_lag
    elif below:
        return below_thresh_lag
    # Fix: previously fell off the end and returned None when neither flag
    # was set; return both dicts instead, matching the above-and-below case.
    return above_thresh_lag, below_thresh_lag
# Daily water-level readings from the Ewaso gauge; columns are
# 'time' (dd/mm/YYYY) and 'water_level(m)'.
water_six = pd.read_csv('./water-level-data-ewaso/1E2020.csv')
water_six
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
time water_level(m)
0 12/05/2020 2.618646
1 13/05/2020 2.551392
2 14/05/2020 2.507711
3 15/05/2020 2.491130
4 16/05/2020 2.434761
... ... ...
259 26/01/2021 0.947099
260 27/01/2021 0.929186
261 28/01/2021 0.911274
262 29/01/2021 0.910711
263 30/01/2021 0.939971

264 rows × 2 columns

# Stations whose best cross-correlation lag with the gauge is within 3 days
# (below=True selects the below-threshold dict).
lag_ = stations_lag(stations_df, water_six, ['time', 'water_level(m)'], lag=3,below=True)
lag_

Plotting

Provides visuals of the data

  • An option to save the figures
  • An option of choosing the dpi
  • provide the startDate based on the water collection starting date
# Water-level series stored alongside the first station that passed the check.
lag_[list(lag_.keys())[0]]['water_list']
import warnings
# NOTE(review): suppresses ALL warnings globally (likely to hide matplotlib
# tick-label warnings from plot_figs) — consider scoping with catch_warnings.
warnings. filterwarnings('ignore')
def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
    """Plot rainfall (bars, left axis) against water level (line, right axis).

    One figure is produced per station column in ``threshold_list``.

    Parameters
    ----------
    weather_stations : pandas.DataFrame
        Rainfall data indexed by datetime.
    water_list : list
        Daily water-level readings; its length defines the plotting window.
    threshold_list : list
        Station column names to plot.
    save : bool, optional
        When True, each figure is saved as '<station>.png'.
    dpi : int, optional
        Resolution used when saving (default 500).
    date : str, optional
        Start date of the water-level record, 'dd-mm-YYYY'.
    """
    start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
    # One reading per day: the window spans len(water_list) days.
    end_date = start_date + datetime.timedelta(len(water_list)-1)
    # weather_stations = weather_stations.set_index('Date')
    df_plot = weather_stations[start_date:end_date]
    df_plot = df_plot[threshold_list].reset_index()
    df_plot.rename(columns={'index':'Date'}, inplace=True)
    
    
    plt.rcParams['figure.figsize'] = (15, 9)
    print('Begin plotting!')
    
    # First column is 'Date'; every remaining column is one station.
    for cols in df_plot.columns[1:]:
        fig, ax1 = plt.subplots()
        color = 'tab:blue'
        ax1.set_xlabel(f'Time', fontsize=24, weight='bold')
        ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
        ax1.bar(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), df_plot[f'{cols}'], color=color, width=4, alpha=1.0)
        ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
        ax1.tick_params(axis='x')
        # NOTE(review): set_xticklabels here installs fixed labels that the
        # DateFormatter below overrides at draw time — only the font settings
        # survive; this is also the usual source of the suppressed warnings.
        ax1.set_xticklabels(df_plot['Date'], fontsize=21, weight='bold')
        ax1.grid(color='gray', linestyle='--', linewidth=0.8)
        ax1.set(facecolor="white")
        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

        color = 'tab:red'
        ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
        ax2.plot(pd.to_datetime(df_plot['Date'], format="%d/%m/%Y"), water_list, color=color, linewidth=4)
        ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
        ax2.set(facecolor="white")
        plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')

        # Month-year ticks on the shared x-axis.
        date_form = DateFormatter("%m-%y")
        ax1.xaxis.set_major_formatter(date_form)
        fig.tight_layout()

        # NOTE(review): figures are never closed, so memory grows with each
        # station plotted — consider plt.close(fig) after saving.
        if save:
            fig.savefig(f'{cols}.png', dpi=dpi)
plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
Begin plotting!

water_level_pipeline_16_1

water_level_pipeline_16_2

water_level_pipeline_16_3

water_level_pipeline_16_4

water_level_pipeline_16_5

water_level_pipeline_16_6

water_level_pipeline_16_7

water_level_pipeline_16_8

water_level_pipeline_16_9

Format to get the stations metadata

def filter_metadata(lag_keys):
    """Fetch TAHMO metadata for the stations named in ``lag_keys``.

    Keys look like 'TA00028_...'; the station code is the part before
    the first underscore.
    """
    station_codes = []
    for key in lag_keys:
        station_codes.append(key.split('_')[0])
    return fs.get_stations_info(multipleStations=station_codes)
filter_metadata(list(lag_.keys()))
API request: services/assets/v2/stations
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
code status installationdate elevationground sensorinstallations dataloggerinstallations creatorid created updaterid updated ... location.countrycode location.zipcode location.latitude location.longitude location.elevationmsl location.note location.creatorid location.created location.updaterid location.updated
26 TA00028 1 2015-08-31T00:00:00Z 9.0 None None 2 2018-12-11T08:35:17.888233Z 2 2018-12-11T08:35:17.888233Z ... KE 0.055219 37.136747 2003.6 {} 2 2018-10-26T13:32:16.15537Z 37 2022-06-30T11:11:50.27135Z
27 TA00029 1 2015-09-02T00:00:00Z 2.0 None None 2 2018-12-11T08:36:19.30342Z 2 2018-12-11T08:36:19.30342Z ... KE -0.500776 36.587511 2545.8 {} 2 2018-10-26T13:33:31.451613Z 37 2022-02-28T12:25:09.578242Z
53 TA00057 1 2015-10-08T00:00:00Z 2.0 None None 2 2018-12-11T09:21:29.092833Z 2 2018-12-11T09:21:29.092833Z ... KE -1.253030 36.856487 1645.3 {} 2 2018-10-29T09:13:33.768613Z 2 2022-07-26T07:34:06.603938Z
68 TA00074 1 2015-11-19T00:00:00Z 2.0 None None 2 2018-12-11T09:38:25.742397Z 2 2018-12-11T09:38:25.742397Z ... KE -0.566080 37.074412 1726.8 {} 2 2018-10-29T10:35:28.49617Z 2 2022-07-26T07:38:42.100985Z
74 TA00080 1 2016-01-28T00:00:00Z 2.0 None None 2 2018-12-11T09:43:10.523398Z 2 2018-12-11T09:43:10.523398Z ... KE -1.087589 36.818402 1777.3 {} 2 2018-10-29T10:53:47.845042Z 37 2022-02-28T13:07:04.709903Z
150 TA00166 1 2017-05-11T00:00:00Z 2.0 None None 2 2018-12-12T08:29:28.10697Z 2 2018-12-12T08:29:28.10697Z ... KE -0.319508 37.659139 1404.0 {} 2 2018-11-10T08:47:37.949135Z 2 2018-11-10T08:47:37.949135Z

6 rows × 28 columns



          

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

filter_stations-0.3.13.tar.gz (22.2 kB view hashes)

Uploaded Source

Built Distribution

filter_stations-0.3.13-py3-none-any.whl (19.2 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page