Skip to main content

Intuitive way of using fuzz matching in pandas

Project description

Intuitive way of using fuzz matching in pandas

Updates

05.10.2022 - Added compare rows

Installation

#Try it first like this: 
# rapidfuzz is a lot faster than fuzzywuzzy, but I had some problems installing it,
# even with the Visual C++ 2019 redistributable installed. a-pandas-ex-fuzz will try to import rapidfuzz first.
pip install a-pandas-ex-plode-tool
pip install a-pandas-ex-df-to-string
pip install rapidfuzz #https://github.com/maxbachmann/RapidFuzz
pip install --no-deps a-pandas-ex-fuzz

#if rapidfuzz does not work, use:
pip install a-pandas-ex-plode-tool
pip install a-pandas-ex-df-to-string
pip install fuzzywuzzy 
pip install --no-deps a-pandas-ex-fuzz


 #Or if you want to try to install everything:
 pip install a-pandas-ex-fuzz

Compare values in column against each other: pandas.Series.s_fuzz_all_values_in_column()

from a_pandas_ex_fuzz import pd_add_fuzzy_matching
pd_add_fuzzy_matching() #adds three new methods to pd.   
import pandas as pd


df = pd.read_csv(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
    )  
df11 = df.Name.s_fuzz_all_values_in_column(
    limit=5, merge_with_series=True, partial_full_weighted="weighted"
)
df22 = df.Name.s_fuzz_all_values_in_column(
    limit=2, merge_with_series=False, partial_full_weighted="full"
)
df33 = df.Name.s_fuzz_all_values_in_column(
    limit=1, merge_with_series=True, partial_full_weighted="partial"
)

df22

  fuzz_string_0 fuzz_match_0 fuzz_index_0 fuzz_string_1 fuzz_match_1  \
0  Braund...     70.833333          477    Cann, ...     63.829787
1  Angle,...     55.445545          518    Astor,...     53.061224
2  Sinkko...     79.069767          747    Honkan...     77.272727
3  Futrel...     77.142857          137    Potter...     52.873563
4  Gilles...     84.615385          722    Saunde...     77.777778
5  Bracke...     77.777778          221    Scanla...     76.470588
6  O'Brie...     65.116279          552    Maisne...     58.536585
7  Goodwi...     68.852459          386    Palsso...     67.857143
8  Rosblo...     62.068966          254    Hockin...      59.52381
9  Nasser...     74.074074          122    Astor,...     58.536585
  fuzz_index_1
0         37
1        700
2        216
3        879
4         12
5        468
6        464
7        374
8        774
9        700

    Parameters:
        df: [pd.Series]
        limit: int
            How many results do you want to have?
            Each result will have 3 columns [string, match, position in column]
            (default=5)
        partial_full_weighted: str
            weighted = fuzz.WRatio
            full = fuzz.ratio
            partial = fuzz.partial_ratio
            (default="weighted")
        merge_with_series: bool
            (default=True)
    Returns:
        pd.DataFrame

Compare values in column against list: pandas.Series.s_fuzz_from_list()

from a_pandas_ex_fuzz import pd_add_fuzzy_matching
pd_add_fuzzy_matching() #adds three new methods to pd.   
import pandas as pd   

df = pd.read_csv(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
    ) 

df111 = df.Name.s_fuzz_from_list(
    list_to_compare=["Johannes", "Paulo", "Kevin"],
    limit=2,
    merge_with_series=True,
    partial_full_weighted="partial",
)
df222 = df.Name.s_fuzz_from_list(
    list_to_compare=["John", "Johannes", "Paulo", "Kevin"],
    limit=3,
    merge_with_series=False,
    partial_full_weighted="full",
)
df333 = df.Name.s_fuzz_from_list(
    list_to_compare=["Maria", "Anna"],
    limit=1,
    merge_with_series=False,
    partial_full_weighted="partial",
)
df333
        fuzz_string_0 fuzz_match_0 fuzz_index_0
0           Maria         60.0            0
1           Maria    44.444444            0
2            Anna         75.0            1
3           Maria         40.0            0
4           Maria         40.0            0
..            ...          ...          ...
886         Maria         40.0            0
887         Maria         80.0            0
888         Maria         60.0            0
889         Maria         40.0            0
890         Maria         60.0            0
[891 rows x 3 columns]

    Parameters:
        df: [pd.Series]
        list_to_compare: list
            The strings you want to be compared
        limit: int
            How many results do you want to have?
            Each result will have 3 columns [string, match, position in column]
            (default=5)
        partial_full_weighted: str
            weighted = fuzz.WRatio
            full = fuzz.ratio
            partial = fuzz.partial_ratio
            (default="weighted")
        merge_with_series: bool
            (default=True)
    Returns:
        pd.DataFrame

Compare values in column against one word: pandas.Series.s_fuzz_one_word()

from a_pandas_ex_fuzz import pd_add_fuzzy_matching
pd_add_fuzzy_matching() #adds three new methods to pd.   
import pandas as pd   

df = pd.read_csv(
        "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
    ) 

df1 = df.Name.s_fuzz_one_word(
word_to_search="Karolina", partial_full_weighted="weighted"
)
df2 = df.Name.s_fuzz_one_word(word_to_search="Karolina", partial_full_weighted="full")
df3 = df.Name.s_fuzz_one_word(
    word_to_search="Karolina", partial_full_weighted="partial"
)
df1
                                                  Name fuzz_string_0  \
0                              Braund, Mr. Owen Harris      Karolina
1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)      Karolina
2                               Heikkinen, Miss. Laina      Karolina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)      Karolina
4                             Allen, Mr. William Henry      Karolina
5                                     Moran, Mr. James      Karolina
6                              McCarthy, Mr. Timothy J      Karolina
7                       Palsson, Master. Gosta Leonard      Karolina
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)      Karolina
9                  Nasser, Mrs. Nicholas (Adele Achem)      Karolina
   fuzz_match_0
0     41.538462
1     33.750000
2     60.000000
3     33.750000
4     42.750000
5     30.000000
6     27.692308
7     45.000000
8     45.600000
9     42.750000

df2
                                                  Name fuzz_string_0  \
0                              Braund, Mr. Owen Harris      Karolina
1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)      Karolina
2                               Heikkinen, Miss. Laina      Karolina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)      Karolina
4                             Allen, Mr. William Henry      Karolina
5                                     Moran, Mr. James      Karolina
6                              McCarthy, Mr. Timothy J      Karolina
7                       Palsson, Master. Gosta Leonard      Karolina
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)      Karolina
9                  Nasser, Mrs. Nicholas (Adele Achem)      Karolina
   fuzz_match_0
0     32.258065
1     17.241379
2     33.333333
3     15.686275
4     31.250000
5     25.000000
6     19.354839
7     31.578947
8     21.428571
9     23.809524

df3
                                                  Name fuzz_string_0  \
0                              Braund, Mr. Owen Harris      Karolina
1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)      Karolina
2                               Heikkinen, Miss. Laina      Karolina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)      Karolina
4                             Allen, Mr. William Henry      Karolina
5                                     Moran, Mr. James      Karolina
6                              McCarthy, Mr. Timothy J      Karolina
7                       Palsson, Master. Gosta Leonard      Karolina
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)      Karolina
9                  Nasser, Mrs. Nicholas (Adele Achem)      Karolina
   fuzz_match_0
0     46.153846
1     37.500000
2     66.666667
3     37.500000
4     46.153846
5     33.333333
6     30.769231
7     50.000000
8     50.000000
9     40.000000

    Parameters:
        df: [pd.Series]
        word_to_search: str
        partial_full_weighted: str
            weighted = fuzz.WRatio
            full = fuzz.ratio
            partial = fuzz.partial_ratio
            (default="weighted")
    Returns:
        pd.DataFrame

Compare one row against all other rows: pandas.Series.ds_fuzz_compare_row_to_others() / pandas.DataFrame.ds_fuzz_compare_row_to_others()

    from a_pandas_ex_fuzz import pd_add_fuzzy_matching
    pd_add_fuzzy_matching()
    import pandas as pd
    df = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
    df.ds_fuzz_compare_row_to_others(2,loc_or_iloc='iloc', partial_full_weighted='full', sort_values=True)    
    
    
    Out[4]:   
    
         PassengerId  Survived  Pclass  ... Cabin Embarked  aa_fuzz_match
    2              3         1       3  ...   NaN        S     100.000000
    216          217         1       3  ...   NaN        S      90.816327
    816          817         0       3  ...   NaN        S      88.118812
    382          383         0       3  ...   NaN        S      83.769634
    400          401         1       3  ...   NaN        S      83.769634
    ..           ...       ...     ...  ...   ...      ...            ...
    745          746         0       1  ...   B22        S      54.450262
    556          557         1       1  ...   A16        C      53.744493
    581          582         1       1  ...   C68        C      53.456221
    669          670         1       1  ...  C126        S      52.132701
    307          308         1       1  ...   C65        C      51.612903
    [891 rows x 13 columns]

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

a_pandas_ex_fuzz-0.13.tar.gz (9.5 kB view hashes)

Uploaded Source

Built Distribution

a_pandas_ex_fuzz-0.13-py3-none-any.whl (12.0 kB view hashes)

Uploaded Python 3

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page