Basic Statistics#

This Notebook shows us how to use the pandas to find basic quantitative descriptions of our data

Topics in this Notebook:

  • dropping columns

  • minimum and maximum ranges

  • averages

  • counts

  • sorting data

  • correlations with corr()

  • describe()

# import schools from the nycschool package
from nycschools import schools

# load the demographic data into a `DataFrame` called df
df = schools.load_school_demographics()

# let's just use one year of data
df = df[df.ay == 2020]

# use a subset of columns for this notebook
cols = [
    'dbn',
    'district',
    'boro',
    'school_name',
    'total_enrollment',
    'asian_pct',
    'black_pct',
    'hispanic_pct',
    'white_pct',
    'swd_pct',
    'ell_pct',
    'poverty_pct'
]
df = df[cols]


df.head()
dbn district boro school_name total_enrollment asian_pct black_pct hispanic_pct white_pct swd_pct ell_pct poverty_pct
4 01M015 1 Manhattan P.S. 015 Roberto Clemente 193 0.135 0.275 0.528 0.057 0.223 0.109 0.819
9 01M019 1 Manhattan P.S. 019 Asher Levy 212 0.061 0.193 0.613 0.080 0.392 0.042 0.712
14 01M020 1 Manhattan P.S. 020 Anna Silver 412 0.248 0.133 0.522 0.073 0.218 0.119 0.709
19 01M034 1 Manhattan P.S. 034 Franklin D. Roosevelt 273 0.026 0.381 0.557 0.029 0.392 0.062 0.960
24 01M063 1 Manhattan The STAR Academy - P.S.63 208 0.029 0.192 0.635 0.091 0.279 0.014 0.769
# sort the data and show just the 10 largest schools
# sort in descending order (biggest --> smallest)
data = df.sort_values(by="total_enrollment", ascending=False)

# show the first 10 rows
data[:10]
dbn district boro school_name total_enrollment asian_pct black_pct hispanic_pct white_pct swd_pct ell_pct poverty_pct
3723 13K430 13 Brooklyn Brooklyn Technical High School 5921 0.599 0.056 0.066 0.237 0.016 0.001 0.590
5310 20K490 20 Brooklyn Fort Hamilton High School 4678 0.290 0.025 0.317 0.347 0.153 0.162 0.706
6686 26Q430 26 Queens Francis Lewis High School 4424 0.577 0.047 0.210 0.150 0.150 0.115 0.690
5802 22K405 22 Brooklyn Midwood High School 4109 0.354 0.255 0.133 0.225 0.138 0.036 0.723
5807 22K425 22 Brooklyn James Madison High School 3851 0.209 0.128 0.175 0.472 0.154 0.121 0.758
7279 28Q440 28 Queens Forest Hills High School 3775 0.247 0.065 0.381 0.272 0.164 0.093 0.702
8199 31R455 31 Staten Island Tottenville High School 3726 0.072 0.014 0.132 0.761 0.227 0.023 0.416
5557 21K525 21 Brooklyn Edward R. Murrow High School 3691 0.274 0.176 0.193 0.308 0.178 0.125 0.670
5300 20K445 20 Brooklyn New Utrecht High School 3572 0.391 0.030 0.336 0.234 0.165 0.218 0.796
6681 26Q415 26 Queens Benjamin N. Cardozo High School 3405 0.423 0.214 0.223 0.120 0.145 0.051 0.668
# get just the total_enrollment column, called a Series in pandas
enrollment = df["total_enrollment"]
print("The largest school:", enrollment.max())
print("The smallest school:", enrollment.min())

print("Avg (mean) school size:", enrollment.mean())
print("Avg (median) school size:", enrollment.median())
print("Avg (mode, can return multiple values) school size:", list(enrollment.mode()))
The largest school: 5921
The smallest school: 7
Avg (mean) school size: 560.79296875
Avg (median) school size: 460.5
Avg (mode, can return multiple values) school size: [479, 714]
# the built in describe() function calculates several descriptive statististics for each column
# in the data frame and returns them as a new dataframe
df.describe()
district total_enrollment asian_pct black_pct hispanic_pct white_pct swd_pct ell_pct poverty_pct
count 2048.00000 2048.000000 2048.000000 2048.000000 2048.000000 2048.000000 2048.000000 2048.000000 2048.000000
mean 27.62207 560.792969 0.117710 0.302644 0.431721 0.118602 0.237641 0.141212 0.766560
std 26.68098 461.014510 0.166804 0.256375 0.246118 0.168041 0.160780 0.143127 0.194297
min 1.00000 7.000000 0.000000 0.000000 0.015000 0.000000 0.000000 0.000000 0.050000
25% 10.00000 304.000000 0.015000 0.081474 0.206000 0.016000 0.163000 0.049000 0.711000
50% 19.00000 460.500000 0.046000 0.245000 0.405000 0.036000 0.208000 0.100000 0.827000
75% 30.00000 663.250000 0.148000 0.472250 0.631250 0.145250 0.261000 0.187250 0.903000
max 84.00000 5921.000000 0.926000 0.935000 1.000000 0.945000 1.000000 1.000000 0.960000
# we can also call describe aon a single series:
df.swd_pct.describe()
count    2048.000000
mean        0.237641
std         0.160780
min         0.000000
25%         0.163000
50%         0.208000
75%         0.261000
max         1.000000
Name: swd_pct, dtype: float64
# we can also call the corr() method to show correclations between columns
# we will take out "district" from this data because the district number
# is categorical -- not the measure of a value

# correlations close to 1 or negative one show high correlations
# closer to zero items are not closely correlated
data = df.drop(columns=["district"])
data.corr()
/tmp/ipykernel_4218/792112253.py:8: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  data.corr()
total_enrollment asian_pct black_pct hispanic_pct white_pct swd_pct ell_pct poverty_pct
total_enrollment 1.000000 0.350881 -0.242553 -0.111551 0.179056 -0.175962 -0.018582 -0.155614
asian_pct 0.350881 1.000000 -0.452366 -0.370264 0.210809 -0.210489 0.138261 -0.287675
black_pct -0.242553 -0.452366 1.000000 -0.416505 -0.450801 0.138008 -0.362794 0.299685
hispanic_pct -0.111551 -0.370264 -0.416505 1.000000 -0.396739 0.070421 0.441878 0.490632
white_pct 0.179056 0.210809 -0.450801 -0.396739 1.000000 -0.083534 -0.177996 -0.795595
swd_pct -0.175962 -0.210489 0.138008 0.070421 -0.083534 1.000000 0.000117 0.233832
ell_pct -0.018582 0.138261 -0.362794 0.441878 -0.177996 0.000117 1.000000 0.359285
poverty_pct -0.155614 -0.287675 0.299685 0.490632 -0.795595 0.233832 0.359285 1.000000
# last, we can use styles to make the correlation table easier to read
# note: you need to run this cell to see the colors -- it's get saved without the styled output
corr = data.corr()
# a coolwarm color map will show values in a gradient where -1 is the deepest blue and 1 is deepest red
corr.style.background_gradient(cmap='coolwarm')
/tmp/ipykernel_4218/143913075.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = data.corr()
  total_enrollment asian_pct black_pct hispanic_pct white_pct swd_pct ell_pct poverty_pct
total_enrollment 1.000000 0.350881 -0.242553 -0.111551 0.179056 -0.175962 -0.018582 -0.155614
asian_pct 0.350881 1.000000 -0.452366 -0.370264 0.210809 -0.210489 0.138261 -0.287675
black_pct -0.242553 -0.452366 1.000000 -0.416505 -0.450801 0.138008 -0.362794 0.299685
hispanic_pct -0.111551 -0.370264 -0.416505 1.000000 -0.396739 0.070421 0.441878 0.490632
white_pct 0.179056 0.210809 -0.450801 -0.396739 1.000000 -0.083534 -0.177996 -0.795595
swd_pct -0.175962 -0.210489 0.138008 0.070421 -0.083534 1.000000 0.000117 0.233832
ell_pct -0.018582 0.138261 -0.362794 0.441878 -0.177996 0.000117 1.000000 0.359285
poverty_pct -0.155614 -0.287675 0.299685 0.490632 -0.795595 0.233832 0.359285 1.000000