Data Quality Checks¶
Key Findings:
Currency fields are all stored as strings (
$XXX.XX) and need a conversion to float valuesmatch_amountcrowdfund_amount_contributions_usdtotal
There are null values in the
totalandcrowdfund_amount_contributions_usdfields that should be labeled0The
regioncolumn is not complete, with 44% of records (2600/5900) having a value ofnoneorundefinedThere is not a meaningful difference between these labels, so I am combining them and coding nulls as
noneSince there are so many missing values, this likely won’t be useful for analysis
from decimal import Decimal
from re import sub
import pandas as pd
from pandas_profiling import ProfileReport
# some data cleaning functions
def parseMoney(money):
money = Decimal(sub(r'[^\d.]', '', money))
return money
df = pd.read_csv('../data/grants_data_raw.csv')
# inspect it
df.head()
| round_number | round_start_date | round_end_date | grant_title | grant_id | region | category | url | match_amount | num_contributions | num_unique_contributors | crowdfund_amount_contributions_usd | total | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12 | 2021-12-01 | 2021-12-16 | Coin Center is educating policy makers about p... | 1668 | north_america | Community | https://gitcoin.co/grants/1668/coin-center-is-... | $340,000.00 | 6914 | 5727 | $103,838.93 | $443,838.93 |
| 1 | 12 | 2021-12-01 | 2021-12-16 | Electronic Frontier Foundation | 3974 | north_america | Community | https://gitcoin.co/grants/3974/electronic-fron... | $228,273.84 | 3648 | 3331 | $58,715.04 | $286,988.88 |
| 2 | 12 | 2021-12-01 | 2021-12-16 | The Tor Project | 2805 | undefined | Infra Tech | https://gitcoin.co/grants/2805/the-tor-project | $188,294.80 | 3497 | 3069 | $95,279.64 | $283,574.44 |
| 3 | 12 | 2021-12-01 | 2021-12-16 | Longevity Prize (by VitaDAO) | 4083 | europe | Grants Round 12 | https://gitcoin.co/grants/4083/longevity-prize... | $176,195.45 | 920 | 857 | $7,815.29 | $184,010.75 |
| 4 | 12 | 2021-12-01 | 2021-12-16 | Rotki - The portfolio tracker and accounting t... | 149 | europe | dApp Tech | https://gitcoin.co/grants/149/rotki-the-portfo... | $129,131.34 | 4684 | 4311 | $25,373.62 | $154,504.96 |
df.dtypes
round_number int64
round_start_date object
round_end_date object
grant_title object
grant_id int64
region object
category object
url object
match_amount object
num_contributions int64
num_unique_contributors int64
crowdfund_amount_contributions_usd object
total object
dtype: object
# parse currency fields into float columns
df['match_amount'] = pd.to_numeric(df['match_amount'].str.replace('[^.0-9]', ''))
df['crowdfund_amount_contributions_usd'] = pd.to_numeric(df['crowdfund_amount_contributions_usd'].str.replace('[^.0-9]', '')).fillna(0)
df['total'] = pd.to_numeric(df['total'].str.replace('[^.0-9]', '')).fillna(0)
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_34847/3482547301.py:2: FutureWarning: The default value of regex will change from True to False in a future version.
df['match_amount'] = pd.to_numeric(df['match_amount'].str.replace('[^.0-9]', ''))
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_34847/3482547301.py:3: FutureWarning: The default value of regex will change from True to False in a future version.
df['crowdfund_amount_contributions_usd'] = pd.to_numeric(df['crowdfund_amount_contributions_usd'].str.replace('[^.0-9]', '')).fillna(0)
/var/folders/x4/w5cvp_v94hj1jb6h056qs6r80000gp/T/ipykernel_34847/3482547301.py:4: FutureWarning: The default value of regex will change from True to False in a future version.
df['total'] = pd.to_numeric(df['total'].str.replace('[^.0-9]', '')).fillna(0)
# recode the region
df['region'] = df['region'].replace('undefined', 'none').fillna('none')
df.describe()
| round_number | grant_id | match_amount | num_contributions | num_unique_contributors | crowdfund_amount_contributions_usd | total | |
|---|---|---|---|---|---|---|---|
| count | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 | 5906.000000 |
| mean | 9.399763 | 1449.218591 | 1146.815586 | 228.686590 | 186.696072 | 1584.362587 | 2731.178214 |
| std | 2.629641 | 1104.701308 | 7916.396923 | 934.435539 | 736.134848 | 10681.589561 | 15436.351606 |
| min | 1.000000 | 12.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 8.000000 | 470.250000 | 0.000000 | 2.000000 | 2.000000 | 2.452500 | 3.000000 |
| 50% | 10.000000 | 1240.500000 | 0.660000 | 9.000000 | 8.000000 | 42.395000 | 50.205000 |
| 75% | 12.000000 | 2202.000000 | 59.182500 | 57.000000 | 47.000000 | 504.605000 | 675.857500 |
| max | 12.000000 | 4367.000000 | 340000.000000 | 22240.000000 | 12317.000000 | 652919.460000 | 674332.580000 |
# profile the data
profile = ProfileReport(df, title="Grants Data Profile")
profile
# write out the clean data
df.to_csv('../data/grants_data.csv', index=False)