Skip to main content

Making it easier to navigate and clean station data

Project description

Documentation

https://filter-stations.netlify.app/

Water Level Pipeline

  • A series of functions to be added to the filter-stations module on PyPI to evaluate which TAHMO stations corroborate with the water level
  • All begins with the coordinates of the gauging station(location of the monitoring sensor)
import os
from pathlib import Path
import haversine as hs
import pandas as pd
import numpy as np
import datetime
import statsmodels.api as sm
from matplotlib.dates import DateFormatter
import matplotlib.pyplot as plt
import warnings
import dateutil.parser
warnings.filterwarnings('ignore')

# config_path: config.json is expected two directories above the current
# working directory (notebook runs from a nested folder)
config_path = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'config.json')
from filter_stations import retreive_data, Interactive_maps, Filter, pipeline
import json
# Authentication: load API credentials from config.json
with open(config_path) as f:
    conf = json.load(f)

apiKey = conf['apiKey']
apiSecret = conf['apiSecret']
map_api_key = conf['map_api_key']
# client objects used throughout: data retrieval, pipeline helpers, and maps
fs = retreive_data(apiKey, apiSecret, map_api_key)
pipe = pipeline(apiKey, apiSecret, map_api_key)
maps = Interactive_maps(apiKey, apiSecret, map_api_key)

Loading data

Load the water level data from the GitHub repository [link here]
Load the TAHMO station data from [link here]

# muringato gauging station coordinates [latitude, longitude]
muringato_loc = [-0.406689, 36.96301]  
# ewaso gauging station coordinates [latitude, longitude]
ewaso_loc = [0.026833, 36.914637]

# Weather stations precipitation data (CSV lives two directories up, in data/)
weather_stations_data = pd.read_csv(os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'stations_precipitation.csv'))

''' The water level data '''
# muringato data sensor 2 2021
muringato_data_s2_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2.csv')

# muringato data sensor 2 2022
muringato_data_s2_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor2-2022.csv')

# muringato data sensor 6 2021
muringato_data_s6_2021 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6.csv')

# muringato data sensor 6 2022
muringato_data_s6_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water_data_2021', 'muringato-sensor6-2022.csv')


# ewaso data sensor 2020 convert the time column to datetime
ewaso_data_2020 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2020.csv')

# ewaso data sensor 2022
ewaso_data_2022 = os.path.join(Path(os.getcwd()).parent.parent.absolute(), 'data', 'water-level-data-ewaso', '1E2022.csv')

# index the precipitation data by its Date column (datetime) so that
# date-range slicing works later in the pipeline
weather_stations_data.Date = weather_stations_data.Date.astype('datetime64[ns]')
weather_stations_data.set_index('Date', inplace=True)

To format the water level data, the file needs a time column and a water level column; the column names can differ, but the order must be time first, then water level.

# handle the water level data
def format_water_level(water_level_data_path):
    # data needs to be in the format time, data/water_level or whatever the column is called
    water_level_data = pd.read_csv(water_level_data_path)
    # rename the first column to time
    water_level_data.rename(columns={water_level_data.columns[0]: 'time'}, inplace=True)
    # convert the time column to datetime
    water_level_data.time = pd.to_datetime([dateutil.parser.parse(i).strftime('%d-%m-%Y') for i in water_level_data['time']])
    water_level_data.time = water_level_data.time.astype('datetime64[ns]')
    # rename the column to water_level
    water_level_data.rename(columns={water_level_data.columns[1]: 'water_level'}, inplace=True)
    # set the time column as the index
    water_level_data.set_index('time', inplace=True)
    return water_level_data
# replace each CSV path with its formatted DataFrame (time-indexed,
# single 'water_level' column)
muringato_data_s2_2021 = format_water_level(muringato_data_s2_2021)
muringato_data_s2_2022 = format_water_level(muringato_data_s2_2022)
muringato_data_s6_2021 = format_water_level(muringato_data_s6_2021)
muringato_data_s6_2022 = format_water_level(muringato_data_s6_2022)
ewaso_data_2020 = format_water_level(ewaso_data_2020)
ewaso_data_2022 = format_water_level(ewaso_data_2022)
  1. Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
  2. Choose stations within a certain radius of the gauging station 100 km for example get the resulting weather data
  3. Get the stations with 100 percent data coverage (no missing data)
  4. Remove the stations data with the value zero from beginning to end if the water level data has some values above zero
  5. Calculate the correlation between the water level data and the weather data needs to be above 0 and have a lag of maximum 3 days
  6. Plot the resulting figures

Choosing ewaso 2020 range

removing stations with missing data reduces from 1035 to 849 columns
removing all zeros reduces from 849 to 604 columns
columns with positive correlation reduces the number from 604 columns to 283 columns
checking for lag reduces the columns to 80

# split stations into those above/below the 3-day lag threshold within a
# 100 km radius of the Muringato gauge; keys are station column names
above, below = pipe.shed_stations(weather_stations_data,
                   muringato_data_s6_2022,
                   muringato_loc,
                   100,
                   lag=3
                   )
# station codes are the prefix before the underscore in each column name
below_stations = [i.split('_')[0] for i in below.keys()]
print(below_stations)
below_stations_metadata = fs.get_stations_info(multipleStations=below_stations)[['code', 'location.latitude', 'location.longitude']]
# (pasted notebook output of the print above)
['TA00001', 'TA00023', 'TA00024', 'TA00025', 'TA00054', 'TA00056', 'TA00067', 'TA00077', 'TA00129', 'TA00147', 'TA00154', 'TA00155', 'TA00156', 'TA00166', 'TA00171', 'TA00189', 'TA00215', 'TA00222', 'TA00228', 'TA00230', 'TA00233', 'TA00250', 'TA00270', 'TA00270', 'TA00272', 'TA00272', 'TA00316', 'TA00317', 'TA00355', 'TA00459', 'TA00473', 'TA00480', 'TA00493', 'TA00494', 'TA00577', 'TA00601', 'TA00621', 'TA00653', 'TA00672', 'TA00676', 'TA00679', 'TA00692', 'TA00699', 'TA00704', 'TA00705', 'TA00711', 'TA00712', 'TA00712', 'TA00715', 'TA00717', 'TA00750', 'TA00751', 'TA00767']
# haversine distance (km by default) from the Muringato gauge to each station
below_stations_metadata['distance']= below_stations_metadata.apply(lambda row: hs.haversine((muringato_loc[0], 
                                                                                             muringato_loc[1]), (row['location.latitude'], 
                                                                                                             row['location.longitude'])), axis=1)
below_stations_metadata.sort_values(by='distance')
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
code location.latitude location.longitude distance
52 TA00056 -0.721656 37.145585 40.480889
22 TA00024 -1.071731 37.045578 74.517013
150 TA00166 -0.319508 37.659139 78.009238
172 TA00189 -0.795260 37.665930 89.304790
230 TA00250 -0.778940 37.676738 89.504935
600 TA00715 -1.225618 36.809065 92.655456
565 TA00679 -1.270835 36.723916 99.698089
23 TA00025 -1.301839 36.760200 102.058383
422 TA00473 -0.512371 35.956813 112.495996
513 TA00621 -1.633020 37.146185 137.874253
51 TA00054 -0.239342 35.728897 138.480985
424 TA00480 -1.376152 37.797646 142.238019
61 TA00067 -1.794285 37.621211 170.765765
140 TA00156 -1.701123 38.068339 189.255406
71 TA00077 -0.383066 35.068406 210.682047
139 TA00155 -2.523037 36.829437 235.795373
21 TA00023 -2.388550 38.040767 250.831198
155 TA00171 -0.002710 34.596908 266.903936
291 TA00317 0.040440 34.371716 292.394991
0 TA00001 -1.123283 34.397992 296.112467
652 TA00767 -2.671990 38.369665 296.467402
290 TA00316 0.289862 34.371222 298.418648
131 TA00147 0.449274 34.282303 312.905564
117 TA00129 -3.390926 37.717656 342.264311
138 TA00154 -4.231107 37.847804 436.466702
211 TA00230 1.724690 33.622000 440.623881
329 TA00355 3.498069 35.843897 451.651266
544 TA00653 0.265062 32.627203 487.869319
196 TA00215 0.052465 32.440690 505.441217
203 TA00222 1.186240 32.020330 577.409865
584 TA00699 -0.707570 31.402138 619.216128
558 TA00672 -6.180302 37.146832 642.321296
597 TA00712 -6.676308 39.131552 737.484276
562 TA00676 -6.780374 38.973512 742.978650
635 TA00750 -6.805316 39.139843 751.347364
636 TA00751 -6.848668 39.082174 753.892793
432 TA00494 -6.833860 39.167475 755.338586
248 TA00270 -6.842390 39.156760 755.852180
250 TA00272 -6.890039 39.117927 759.501414
431 TA00493 -6.910845 39.075597 760.236606
214 TA00233 3.453500 31.251250 766.277105
209 TA00228 3.404720 30.959600 790.422401
498 TA00601 -14.080148 33.907593 1557.147407
602 TA00717 3.898305 11.886437 2827.236339
590 TA00705 4.952251 8.341692 3234.191975
481 TA00577 10.487147 9.788223 3240.086078
589 TA00704 5.378602 6.998292 3388.907422
596 TA00711 4.906530 6.917064 3389.011984
410 TA00459 9.066148 6.569080 3526.820348
577 TA00692 6.404114 5.626307 3559.025765
# Interactive visuals
import plotly.express as px
import plotly.graph_objects as go

# scatter map of the below-threshold stations, hover shows code + distance
fig = px.scatter_mapbox(below_stations_metadata, 
                        lat="location.latitude", 
                        lon="location.longitude", 
                        hover_name="code", 
                        hover_data=["distance"],
                        color_discrete_sequence=["fuchsia"],
                        zoom=8,
                        height=800,
                        )
# update marker size
fig.update_traces(marker=dict(size=10))
# add a point for the central station
fig.add_trace(go.Scattermapbox(
        lat=[muringato_loc[0]],
        lon=[muringato_loc[1]],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=14
        ),
        text=['Muringato gauging station'],
    ))

# carto-positron basemap needs no mapbox token; hide legend, drop margins
fig.update_layout(
    mapbox_style="carto-positron",
    margin={"r":0,"t":0,"l":0,"b":0},
    showlegend=False
)
fig.show()
# plot rainfall vs. water level for every below-threshold station, starting
# at the first date of the Muringato sensor-6 2022 series
pipe.plot_figs(
    weather_stations_data,
    list(muringato_data_s6_2022['water_level']),
    list(below.keys()),
    date=dateutil.parser.parse(str(muringato_data_s6_2022.index[0])).strftime('%d-%m-%Y'), 
    save=False   
)
Begin plotting!

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

# search radius (km) around the gauging station
RADIUS = 100

# restrict the weather data to the ewaso water-level date range
ewaso_weather_data_2020 = weather_stations_data.loc[ewaso_data_2020.index[0]:ewaso_data_2020.index[-1]]
# ewaso stations within a particular radius
ewaso_tahmo_stations_2020 = pipe.stations_within_radius(RADIUS, ewaso_loc[0], ewaso_loc[1], df=False)
# Get stations without missing data
# ewaso weather data (percentage=1 means 100% completeness required)
ewaso_weather_data_2020_filtered = pipe.stations_data_check(stations_list=list(ewaso_tahmo_stations_2020), 
                                              percentage=1, data=ewaso_weather_data_2020
                                              )
# Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
ewaso_weather_data_2020_filtered = ewaso_weather_data_2020_filtered.loc[:, ewaso_weather_data_2020_filtered.sum() != 0]
API request: services/assets/v2/stations
import statsmodels.api as sm
def calculate_lag(weather_stations_data, water_level_data, lag=3, above=None, below=None):
    """Split weather stations by cross-correlation lag against water level.

    Stations whose Pearson correlation with ``water_level_data['water_level']``
    is not positive are skipped. For each remaining station the lag is the
    argmax of the cross-correlation function (``sm.tsa.stattools.ccf``),
    i.e. the offset (in samples/days) with the strongest coefficient.

    Parameters
    ----------
    weather_stations_data : pd.DataFrame
        One column per station (same date range as the water level data).
    water_level_data : pd.DataFrame
        Must contain a 'water_level' column.
    lag : int, default 3
        Threshold separating the two result groups.
    above, below : bool, optional
        If `above` is truthy only the above-threshold dict is returned;
        else if `below` is truthy only the below-threshold dict; otherwise
        both are returned as a tuple ``(above, below)``.

    Returns
    -------
    dict or (dict, dict)
        Mapping of station column name -> best lag.
    """
    above_threshold_lag = {}
    below_threshold_lag = {}
    for col in weather_stations_data.columns:
        # check for positive correlation; if not, skip the column
        if weather_stations_data[col].corr(water_level_data['water_level']) <= 0:
            continue
        # lag with the maximum cross-correlation coefficient
        # NOTE(review): ccf does not handle NaNs — assumes the inputs were
        # already filtered for completeness upstream; confirm for new callers
        coefficients = sm.tsa.stattools.ccf(weather_stations_data[col], water_level_data['water_level'])
        best_lag = int(np.argmax(coefficients))
        if best_lag > lag:
            above_threshold_lag[col] = best_lag
        else:
            below_threshold_lag[col] = best_lag
    if above:
        return above_threshold_lag
    if below:
        return below_threshold_lag
    return above_threshold_lag, below_threshold_lag

Bringing all the functions together to create a pipeline

def shed_stations(weather_stations_data, water_level_data,
                  gauging_station_coords, radius, lag=3,
                  percentage=1, above=None, below=None):
    """Run the full station-filtering pipeline for one gauging station.

    Steps: restrict dates to the water-level range, keep stations within
    `radius` km of the gauge, drop stations below the completeness
    `percentage`, drop all-zero columns, then split by correlation lag.

    NOTE(review): `above` and `below` are accepted but never forwarded to
    calculate_lag — both dicts are always returned. Confirm whether that
    is intentional before changing (the caller below unpacks a tuple).

    Returns (above_threshold_lag, below_threshold_lag) dicts mapping
    station column name -> lag.
    """
    # Filter the date range based on the water level data from first day of the water level data to the last day of the water level data
    weather_stations_data = weather_stations_data.loc[water_level_data.index[0]:water_level_data.index[-1]]
    # Filter the weather stations based on the radius
    lat, lon = gauging_station_coords[0], gauging_station_coords[1]
    weather_stations_data_list = pipe.stations_within_radius(radius, lat, lon, df=False)
    # get stations without missing data or the percentage of stations with missing data
    weather_stations_data_filtered = pipe.stations_data_check(stations_list=weather_stations_data_list,
                                                              percentage=percentage,
                                                              data=weather_stations_data)
    # Check the sum of each column and drop columns with a sum of zero this is if the sum of water level is not equal to zero
    weather_stations_data_filtered = weather_stations_data_filtered.loc[:, weather_stations_data_filtered.sum() != 0]

    # Filter the weather stations based on the lag and positive correlation
    above_threshold_lag, below_threshold_lag = calculate_lag(weather_stations_data_filtered, water_level_data, lag=lag)

    return above_threshold_lag, below_threshold_lag
# NOTE(review): above=True/below=False are ignored by this shed_stations —
# it always returns both dicts (hence the tuple unpacking works)
above_threshold_lag, below_threshold_lag = shed_stations(weather_stations_data, ewaso_data_2020, ewaso_loc, RADIUS, lag=3, percentage=1, above=True, below=False)
len(below_threshold_lag)
API request: services/assets/v2/stations





80

Plot the figures

pipe.plot_figs(
    weather_stations_data,
    list(ewaso_data_2020['water_level']),
    list(below_threshold_lag.keys()),
    date=dateutil.parser.parse(str(ewaso_data_2020.index[0])).strftime('%d-%m-%Y'), 
    save=True   
)
Begin plotting!

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

png

Input water level data
Input TAHMO station data

# plot the two with different colors
# NOTE(review): muringato_tahmo_stations / ewaso_tahmo_stations are not
# defined in this excerpt — presumably station-metadata DataFrames built
# earlier in the notebook; confirm before running standalone
fig, ax = plt.subplots(figsize=(10, 10))
# Muringato stations in blue
muringato_tahmo_stations.plot(kind='scatter',
                            x='location.longitude',
                            y='location.latitude',
                            color='blue',
                            alpha=0.7,
                            ax=ax)
# Ewaso stations in red, on the same axes
ewaso_tahmo_stations.plot(kind='scatter',
                            x='location.longitude',
                            y='location.latitude',
                            color='red',
                            alpha=0.7,
                            ax=ax)
plt.show()

png

Apart from the completeness another method of validation by eliminating unusable sensors is checking for a positive correlation and lag

  • The default lag is 3 days between a particular station and the gauging station
  • The required format is a timeseries data
  • Provide the column names for evaluation format = [Date, data]
  • with the change in parameters one can choose above or below threshold
def plot_figs(weather_stations, water_list, threshold_list, save=False, dpi=500, date='11-02-2021'):
    """Plot rainfall (bars) against water level/stage (line) per station.

    One figure is produced for each column in `threshold_list`.

    Parameters
    ----------
    weather_stations : pd.DataFrame
        Datetime-indexed precipitation data, one column per station.
    water_list : list
        Daily water-level readings; its length defines the plot window.
    threshold_list : list
        Station column names to plot.
    save : bool, default False
        If True, each figure is saved as '<station>.png'.
    dpi : int, default 500
        Resolution used when saving.
    date : str, default '11-02-2021'
        Start date of `water_list`, in '%d-%m-%Y' format.
    """
    start_date = datetime.datetime.strptime(date, "%d-%m-%Y")
    # the water-level series defines the window: one reading per day
    end_date = start_date + datetime.timedelta(len(water_list) - 1)
    df_plot = weather_stations[start_date:end_date]
    df_plot = df_plot[threshold_list].reset_index()
    df_plot.rename(columns={'index': 'Date'}, inplace=True)
    # convert the date column once, outside the per-station loop
    dates = pd.to_datetime(df_plot['Date'], format="%d/%m/%Y")

    plt.rcParams['figure.figsize'] = (15, 9)
    print('Begin plotting!')

    for cols in df_plot.columns[1:]:
        fig, ax1 = plt.subplots()
        color = 'tab:blue'
        ax1.set_xlabel('Time', fontsize=24, weight='bold')
        ax1.set_ylabel(f'Rainfall {cols} (mm)', color=color, fontsize=24, weight='bold')
        ax1.bar(dates, df_plot[cols], color=color, width=4, alpha=1.0)
        ax1.tick_params(axis='y', labelcolor=color, labelsize=24)
        # The previous version called ax1.set_xticklabels(df_plot['Date'], ...),
        # which pins data values to arbitrary tick positions; the DateFormatter
        # below labels date ticks correctly, so only the size is styled here.
        ax1.tick_params(axis='x', labelsize=21)
        ax1.grid(color='gray', linestyle='--', linewidth=0.8)
        ax1.set(facecolor="white")
        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

        color = 'tab:red'
        ax2.set_ylabel('Water level/Stage (m)', color=color, fontsize=24, weight='bold')
        ax2.plot(dates, water_list, color=color, linewidth=4)
        ax2.tick_params(axis='y', labelcolor=color, labelsize=24)
        ax2.set(facecolor="white")
        plt.title('Stage and Rainfall against Time', fontsize=22, weight='bold')

        date_form = DateFormatter("%m-%y")
        ax1.xaxis.set_major_formatter(date_form)
        fig.tight_layout()

        if save:
            fig.savefig(f'{cols}.png', dpi=dpi)
# NOTE(review): stations_df and lag_ are not defined in this excerpt —
# presumably built earlier in the notebook; confirm before running standalone
plot_figs(stations_df, lag_[list(lag_.keys())[0]]['water_list'], list(lag_.keys()), save=True, date='12-05-2020')
Begin plotting!

png

png

png

png

png

png

png

png

png

Format to get the stations metadata

def filter_metadata(lag_keys):
    """Fetch TAHMO metadata for the stations named in `lag_keys`.

    Each key is a column name whose prefix (before the first underscore)
    is the station code; the codes are looked up via the API client `fs`.
    """
    station_codes = [key.split('_')[0] for key in lag_keys]
    return fs.get_stations_info(multipleStations=station_codes)
filter_metadata(list(lag_.keys()))
API request: services/assets/v2/stations
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
code status installationdate elevationground sensorinstallations dataloggerinstallations creatorid created updaterid updated ... location.countrycode location.zipcode location.latitude location.longitude location.elevationmsl location.note location.creatorid location.created location.updaterid location.updated
26 TA00028 1 2015-08-31T00:00:00Z 9.0 None None 2 2018-12-11T08:35:17.888233Z 2 2018-12-11T08:35:17.888233Z ... KE 0.055219 37.136747 2003.6 {} 2 2018-10-26T13:32:16.15537Z 37 2022-06-30T11:11:50.27135Z
27 TA00029 1 2015-09-02T00:00:00Z 2.0 None None 2 2018-12-11T08:36:19.30342Z 2 2018-12-11T08:36:19.30342Z ... KE -0.500776 36.587511 2545.8 {} 2 2018-10-26T13:33:31.451613Z 37 2022-02-28T12:25:09.578242Z
53 TA00057 1 2015-10-08T00:00:00Z 2.0 None None 2 2018-12-11T09:21:29.092833Z 2 2018-12-11T09:21:29.092833Z ... KE -1.253030 36.856487 1645.3 {} 2 2018-10-29T09:13:33.768613Z 2 2022-07-26T07:34:06.603938Z
68 TA00074 1 2015-11-19T00:00:00Z 2.0 None None 2 2018-12-11T09:38:25.742397Z 2 2018-12-11T09:38:25.742397Z ... KE -0.566080 37.074412 1726.8 {} 2 2018-10-29T10:35:28.49617Z 2 2022-07-26T07:38:42.100985Z
74 TA00080 1 2016-01-28T00:00:00Z 2.0 None None 2 2018-12-11T09:43:10.523398Z 2 2018-12-11T09:43:10.523398Z ... KE -1.087589 36.818402 1777.3 {} 2 2018-10-29T10:53:47.845042Z 37 2022-02-28T13:07:04.709903Z
150 TA00166 1 2017-05-11T00:00:00Z 2.0 None None 2 2018-12-12T08:29:28.10697Z 2 2018-12-12T08:29:28.10697Z ... KE -0.319508 37.659139 1404.0 {} 2 2018-11-10T08:47:37.949135Z 2 2018-11-10T08:47:37.949135Z

6 rows × 28 columns



          

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

filter_stations-0.4.3.tar.gz (37.8 kB view hashes)

Uploaded Source

Built Distribution

filter_stations-0.4.3-py3-none-any.whl (24.2 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page