Basic Statistics

# import schools from the nycschool package
from nycschools import schools

# load the demographic data into a `DataFrame` called df
df = schools.load_school_demographics()

# let's just use one year of data
df = df[df.ay == 2020]

# use a subset of columns for this notebook
cols = [
    'dbn',
    'district',
    'boro',
    'school_name',
    'total_enrollment',
    'asian_pct',
    'black_pct',
    'hispanic_pct',
    'white_pct',
    'swd_pct',
    'ell_pct',
    'poverty_pct'
]
df = df[cols]


df.head()

	dbn	district	boro	school_name	total_enrollment	asian_pct	black_pct	hispanic_pct	white_pct	swd_pct	ell_pct	poverty_pct
4	01M015	1	Manhattan	P.S. 015 Roberto Clemente	193	0.135	0.275	0.528	0.057	0.223	0.109	0.819
9	01M019	1	Manhattan	P.S. 019 Asher Levy	212	0.061	0.193	0.613	0.080	0.392	0.042	0.712
14	01M020	1	Manhattan	P.S. 020 Anna Silver	412	0.248	0.133	0.522	0.073	0.218	0.119	0.709
19	01M034	1	Manhattan	P.S. 034 Franklin D. Roosevelt	273	0.026	0.381	0.557	0.029	0.392	0.062	0.960
24	01M063	1	Manhattan	The STAR Academy - P.S.63	208	0.029	0.192	0.635	0.091	0.279	0.014	0.769

# sort the data and show just the 10 largest schools
# sort in descending order (biggest --> smallest)
data = df.sort_values(by="total_enrollment", ascending=False)

# show the first 10 rows
data[:10]

	dbn	district	boro	school_name	total_enrollment	asian_pct	black_pct	hispanic_pct	white_pct	swd_pct	ell_pct	poverty_pct
3723	13K430	13	Brooklyn	Brooklyn Technical High School	5921	0.599	0.056	0.066	0.237	0.016	0.001	0.590
5310	20K490	20	Brooklyn	Fort Hamilton High School	4678	0.290	0.025	0.317	0.347	0.153	0.162	0.706
6686	26Q430	26	Queens	Francis Lewis High School	4424	0.577	0.047	0.210	0.150	0.150	0.115	0.690
5802	22K405	22	Brooklyn	Midwood High School	4109	0.354	0.255	0.133	0.225	0.138	0.036	0.723
5807	22K425	22	Brooklyn	James Madison High School	3851	0.209	0.128	0.175	0.472	0.154	0.121	0.758
7279	28Q440	28	Queens	Forest Hills High School	3775	0.247	0.065	0.381	0.272	0.164	0.093	0.702
8199	31R455	31	Staten Island	Tottenville High School	3726	0.072	0.014	0.132	0.761	0.227	0.023	0.416
5557	21K525	21	Brooklyn	Edward R. Murrow High School	3691	0.274	0.176	0.193	0.308	0.178	0.125	0.670
5300	20K445	20	Brooklyn	New Utrecht High School	3572	0.391	0.030	0.336	0.234	0.165	0.218	0.796
6681	26Q415	26	Queens	Benjamin N. Cardozo High School	3405	0.423	0.214	0.223	0.120	0.145	0.051	0.668

# get just the total_enrollment column, called a Series in pandas
enrollment = df["total_enrollment"]
print("The largest school:", enrollment.max())
print("The smallest school:", enrollment.min())

print("Avg (mean) school size:", enrollment.mean())
print("Avg (median) school size:", enrollment.median())
print("Avg (mode, can return multiple values) school size:", list(enrollment.mode()))

The largest school: 5921
The smallest school: 7
Avg (mean) school size: 560.79296875
Avg (median) school size: 460.5
Avg (mode, can return multiple values) school size: [479, 714]

# the built in describe() function calculates several descriptive statististics for each column
# in the data frame and returns them as a new dataframe
df.describe()

	district	total_enrollment	asian_pct	black_pct	hispanic_pct	white_pct	swd_pct	ell_pct	poverty_pct
count	2048.00000	2048.000000	2048.000000	2048.000000	2048.000000	2048.000000	2048.000000	2048.000000	2048.000000
mean	27.62207	560.792969	0.117710	0.302644	0.431721	0.118602	0.237641	0.141212	0.766560
std	26.68098	461.014510	0.166804	0.256375	0.246118	0.168041	0.160780	0.143127	0.194297
min	1.00000	7.000000	0.000000	0.000000	0.015000	0.000000	0.000000	0.000000	0.050000
25%	10.00000	304.000000	0.015000	0.081474	0.206000	0.016000	0.163000	0.049000	0.711000
50%	19.00000	460.500000	0.046000	0.245000	0.405000	0.036000	0.208000	0.100000	0.827000
75%	30.00000	663.250000	0.148000	0.472250	0.631250	0.145250	0.261000	0.187250	0.903000
max	84.00000	5921.000000	0.926000	0.935000	1.000000	0.945000	1.000000	1.000000	0.960000

# we can also call describe aon a single series:
df.swd_pct.describe()

count    2048.000000
mean        0.237641
std         0.160780
min         0.000000
25%         0.163000
50%         0.208000
75%         0.261000
max         1.000000
Name: swd_pct, dtype: float64

# we can also call the corr() method to show correclations between columns
# we will take out "district" from this data because the district number
# is categorical -- not the measure of a value

# correlations close to 1 or negative one show high correlations
# closer to zero items are not closely correlated
data = df.drop(columns=["district"])
data.corr()

/tmp/ipykernel_4218/792112253.py:8: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  data.corr()

	total_enrollment	asian_pct	black_pct	hispanic_pct	white_pct	swd_pct	ell_pct	poverty_pct
total_enrollment	1.000000	0.350881	-0.242553	-0.111551	0.179056	-0.175962	-0.018582	-0.155614
asian_pct	0.350881	1.000000	-0.452366	-0.370264	0.210809	-0.210489	0.138261	-0.287675
black_pct	-0.242553	-0.452366	1.000000	-0.416505	-0.450801	0.138008	-0.362794	0.299685
hispanic_pct	-0.111551	-0.370264	-0.416505	1.000000	-0.396739	0.070421	0.441878	0.490632
white_pct	0.179056	0.210809	-0.450801	-0.396739	1.000000	-0.083534	-0.177996	-0.795595
swd_pct	-0.175962	-0.210489	0.138008	0.070421	-0.083534	1.000000	0.000117	0.233832
ell_pct	-0.018582	0.138261	-0.362794	0.441878	-0.177996	0.000117	1.000000	0.359285
poverty_pct	-0.155614	-0.287675	0.299685	0.490632	-0.795595	0.233832	0.359285	1.000000

# last, we can use styles to make the correlation table easier to read
# note: you need to run this cell to see the colors -- it's get saved without the styled output
corr = data.corr()
# a coolwarm color map will show values in a gradient where -1 is the deepest blue and 1 is deepest red
corr.style.background_gradient(cmap='coolwarm')

/tmp/ipykernel_4218/143913075.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = data.corr()

	total_enrollment	asian_pct	black_pct	hispanic_pct	white_pct	swd_pct	ell_pct	poverty_pct
total_enrollment	1.000000	0.350881	-0.242553	-0.111551	0.179056	-0.175962	-0.018582	-0.155614
asian_pct	0.350881	1.000000	-0.452366	-0.370264	0.210809	-0.210489	0.138261	-0.287675
black_pct	-0.242553	-0.452366	1.000000	-0.416505	-0.450801	0.138008	-0.362794	0.299685
hispanic_pct	-0.111551	-0.370264	-0.416505	1.000000	-0.396739	0.070421	0.441878	0.490632
white_pct	0.179056	0.210809	-0.450801	-0.396739	1.000000	-0.083534	-0.177996	-0.795595
swd_pct	-0.175962	-0.210489	0.138008	0.070421	-0.083534	1.000000	0.000117	0.233832
ell_pct	-0.018582	0.138261	-0.362794	0.441878	-0.177996	0.000117	1.000000	0.359285
poverty_pct	-0.155614	-0.287675	0.299685	0.490632	-0.795595	0.233832	0.359285	1.000000

NYC Schools Open Data Portal

Basic Statistics#