Basic Statistics
Basic Statistics#
This Notebook shows us how to use the pandas
to find
basic quantitative descriptions of our data
Topics in this Notebook:
dropping columns
minimum and maximum ranges
averages
counts
sorting data
correlations with
corr()
describe()
# import schools from the nycschool package
from nycschools import schools
# load the demographic data into a `DataFrame` called df
df = schools.load_school_demographics()
# let's just use one year of data
df = df[df.ay == 2020]
# use a subset of columns for this notebook
cols = [
'dbn',
'district',
'boro',
'school_name',
'total_enrollment',
'asian_pct',
'black_pct',
'hispanic_pct',
'white_pct',
'swd_pct',
'ell_pct',
'poverty_pct'
]
df = df[cols]
df.head()
dbn | district | boro | school_name | total_enrollment | asian_pct | black_pct | hispanic_pct | white_pct | swd_pct | ell_pct | poverty_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 01M015 | 1 | Manhattan | P.S. 015 Roberto Clemente | 193 | 0.135 | 0.275 | 0.528 | 0.057 | 0.223 | 0.109 | 0.819 |
9 | 01M019 | 1 | Manhattan | P.S. 019 Asher Levy | 212 | 0.061 | 0.193 | 0.613 | 0.080 | 0.392 | 0.042 | 0.712 |
14 | 01M020 | 1 | Manhattan | P.S. 020 Anna Silver | 412 | 0.248 | 0.133 | 0.522 | 0.073 | 0.218 | 0.119 | 0.709 |
19 | 01M034 | 1 | Manhattan | P.S. 034 Franklin D. Roosevelt | 273 | 0.026 | 0.381 | 0.557 | 0.029 | 0.392 | 0.062 | 0.960 |
24 | 01M063 | 1 | Manhattan | The STAR Academy - P.S.63 | 208 | 0.029 | 0.192 | 0.635 | 0.091 | 0.279 | 0.014 | 0.769 |
# sort the data and show just the 10 largest schools
# sort in descending order (biggest --> smallest)
data = df.sort_values(by="total_enrollment", ascending=False)
# show the first 10 rows
data[:10]
dbn | district | boro | school_name | total_enrollment | asian_pct | black_pct | hispanic_pct | white_pct | swd_pct | ell_pct | poverty_pct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
3723 | 13K430 | 13 | Brooklyn | Brooklyn Technical High School | 5921 | 0.599 | 0.056 | 0.066 | 0.237 | 0.016 | 0.001 | 0.590 |
5310 | 20K490 | 20 | Brooklyn | Fort Hamilton High School | 4678 | 0.290 | 0.025 | 0.317 | 0.347 | 0.153 | 0.162 | 0.706 |
6686 | 26Q430 | 26 | Queens | Francis Lewis High School | 4424 | 0.577 | 0.047 | 0.210 | 0.150 | 0.150 | 0.115 | 0.690 |
5802 | 22K405 | 22 | Brooklyn | Midwood High School | 4109 | 0.354 | 0.255 | 0.133 | 0.225 | 0.138 | 0.036 | 0.723 |
5807 | 22K425 | 22 | Brooklyn | James Madison High School | 3851 | 0.209 | 0.128 | 0.175 | 0.472 | 0.154 | 0.121 | 0.758 |
7279 | 28Q440 | 28 | Queens | Forest Hills High School | 3775 | 0.247 | 0.065 | 0.381 | 0.272 | 0.164 | 0.093 | 0.702 |
8199 | 31R455 | 31 | Staten Island | Tottenville High School | 3726 | 0.072 | 0.014 | 0.132 | 0.761 | 0.227 | 0.023 | 0.416 |
5557 | 21K525 | 21 | Brooklyn | Edward R. Murrow High School | 3691 | 0.274 | 0.176 | 0.193 | 0.308 | 0.178 | 0.125 | 0.670 |
5300 | 20K445 | 20 | Brooklyn | New Utrecht High School | 3572 | 0.391 | 0.030 | 0.336 | 0.234 | 0.165 | 0.218 | 0.796 |
6681 | 26Q415 | 26 | Queens | Benjamin N. Cardozo High School | 3405 | 0.423 | 0.214 | 0.223 | 0.120 | 0.145 | 0.051 | 0.668 |
# get just the total_enrollment column, called a Series in pandas
enrollment = df["total_enrollment"]
print("The largest school:", enrollment.max())
print("The smallest school:", enrollment.min())
print("Avg (mean) school size:", enrollment.mean())
print("Avg (median) school size:", enrollment.median())
print("Avg (mode, can return multiple values) school size:", list(enrollment.mode()))
The largest school: 5921
The smallest school: 7
Avg (mean) school size: 560.79296875
Avg (median) school size: 460.5
Avg (mode, can return multiple values) school size: [479, 714]
# the built in describe() function calculates several descriptive statististics for each column
# in the data frame and returns them as a new dataframe
df.describe()
district | total_enrollment | asian_pct | black_pct | hispanic_pct | white_pct | swd_pct | ell_pct | poverty_pct | |
---|---|---|---|---|---|---|---|---|---|
count | 2048.00000 | 2048.000000 | 2048.000000 | 2048.000000 | 2048.000000 | 2048.000000 | 2048.000000 | 2048.000000 | 2048.000000 |
mean | 27.62207 | 560.792969 | 0.117710 | 0.302644 | 0.431721 | 0.118602 | 0.237641 | 0.141212 | 0.766560 |
std | 26.68098 | 461.014510 | 0.166804 | 0.256375 | 0.246118 | 0.168041 | 0.160780 | 0.143127 | 0.194297 |
min | 1.00000 | 7.000000 | 0.000000 | 0.000000 | 0.015000 | 0.000000 | 0.000000 | 0.000000 | 0.050000 |
25% | 10.00000 | 304.000000 | 0.015000 | 0.081474 | 0.206000 | 0.016000 | 0.163000 | 0.049000 | 0.711000 |
50% | 19.00000 | 460.500000 | 0.046000 | 0.245000 | 0.405000 | 0.036000 | 0.208000 | 0.100000 | 0.827000 |
75% | 30.00000 | 663.250000 | 0.148000 | 0.472250 | 0.631250 | 0.145250 | 0.261000 | 0.187250 | 0.903000 |
max | 84.00000 | 5921.000000 | 0.926000 | 0.935000 | 1.000000 | 0.945000 | 1.000000 | 1.000000 | 0.960000 |
# we can also call describe aon a single series:
df.swd_pct.describe()
count 2048.000000
mean 0.237641
std 0.160780
min 0.000000
25% 0.163000
50% 0.208000
75% 0.261000
max 1.000000
Name: swd_pct, dtype: float64
# we can also call the corr() method to show correclations between columns
# we will take out "district" from this data because the district number
# is categorical -- not the measure of a value
# correlations close to 1 or negative one show high correlations
# closer to zero items are not closely correlated
data = df.drop(columns=["district"])
data.corr()
/tmp/ipykernel_4218/792112253.py:8: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
data.corr()
total_enrollment | asian_pct | black_pct | hispanic_pct | white_pct | swd_pct | ell_pct | poverty_pct | |
---|---|---|---|---|---|---|---|---|
total_enrollment | 1.000000 | 0.350881 | -0.242553 | -0.111551 | 0.179056 | -0.175962 | -0.018582 | -0.155614 |
asian_pct | 0.350881 | 1.000000 | -0.452366 | -0.370264 | 0.210809 | -0.210489 | 0.138261 | -0.287675 |
black_pct | -0.242553 | -0.452366 | 1.000000 | -0.416505 | -0.450801 | 0.138008 | -0.362794 | 0.299685 |
hispanic_pct | -0.111551 | -0.370264 | -0.416505 | 1.000000 | -0.396739 | 0.070421 | 0.441878 | 0.490632 |
white_pct | 0.179056 | 0.210809 | -0.450801 | -0.396739 | 1.000000 | -0.083534 | -0.177996 | -0.795595 |
swd_pct | -0.175962 | -0.210489 | 0.138008 | 0.070421 | -0.083534 | 1.000000 | 0.000117 | 0.233832 |
ell_pct | -0.018582 | 0.138261 | -0.362794 | 0.441878 | -0.177996 | 0.000117 | 1.000000 | 0.359285 |
poverty_pct | -0.155614 | -0.287675 | 0.299685 | 0.490632 | -0.795595 | 0.233832 | 0.359285 | 1.000000 |
# last, we can use styles to make the correlation table easier to read
# note: you need to run this cell to see the colors -- it's get saved without the styled output
corr = data.corr()
# a coolwarm color map will show values in a gradient where -1 is the deepest blue and 1 is deepest red
corr.style.background_gradient(cmap='coolwarm')
/tmp/ipykernel_4218/143913075.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
corr = data.corr()
total_enrollment | asian_pct | black_pct | hispanic_pct | white_pct | swd_pct | ell_pct | poverty_pct | |
---|---|---|---|---|---|---|---|---|
total_enrollment | 1.000000 | 0.350881 | -0.242553 | -0.111551 | 0.179056 | -0.175962 | -0.018582 | -0.155614 |
asian_pct | 0.350881 | 1.000000 | -0.452366 | -0.370264 | 0.210809 | -0.210489 | 0.138261 | -0.287675 |
black_pct | -0.242553 | -0.452366 | 1.000000 | -0.416505 | -0.450801 | 0.138008 | -0.362794 | 0.299685 |
hispanic_pct | -0.111551 | -0.370264 | -0.416505 | 1.000000 | -0.396739 | 0.070421 | 0.441878 | 0.490632 |
white_pct | 0.179056 | 0.210809 | -0.450801 | -0.396739 | 1.000000 | -0.083534 | -0.177996 | -0.795595 |
swd_pct | -0.175962 | -0.210489 | 0.138008 | 0.070421 | -0.083534 | 1.000000 | 0.000117 | 0.233832 |
ell_pct | -0.018582 | 0.138261 | -0.362794 | 0.441878 | -0.177996 | 0.000117 | 1.000000 | 0.359285 |
poverty_pct | -0.155614 | -0.287675 | 0.299685 | 0.490632 | -0.795595 | 0.233832 | 0.359285 | 1.000000 |