# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import pandas as pd
import numpy as np
import wget
import os.path
import zipfile
from . import schools
from . import config
import shutil
urls = config.urls
[docs]def load_nyc_nysed():
"""
Load the subset set of the `load_nys_nysed` data for schools in the
New York City Department of Education school demographics data set.
"""
df = load_nys_nysed()
nyc = schools.load_school_demographics()
beds = nyc.beds.unique()
return df[df.beds.isin(beds)]
[docs]def load_nys_nysed():
"""
Load the grades 3-8 math and ela exam results for all schools and distrcicts
in New York State, in a long data format. This is the only data set that has
demographic test results for charter schools in NYC. This data has all of
the same columns as the NYC `load_math_ela_long`. Note that NYS does not
report the same demographic categories as New York City. In particular, the
two groups handle ENL students differently. NYC has 3 categories (current
ell, ever ell, never ell) where NYS only has ELL and Non-ELL.
The NYSED data includes categories that are not part of the NYC data such as
homelessness, foster care, and parents in armed services.
"""
try:
# try to load it locally to save time
feather = os.path.join(config.data_dir, "nysed-exams.feather")
return pd.read_feather(feather)
except FileNotFoundError:
return load_nysed_ela_math_archives()
[docs]def load_nysed_ela_math_archives(urls=urls["nysed_math_ela"].urls):
"""
Downloads all of the zip archives listed in the urls configuraiton. Extract
the archives into a temp folder and read them as dataframes. Concats the
dataframes into a single df and return.
"""
tmp = os.path.join(config.data_dir, "tmp")
if not os.path.exists(tmp):
os.mkdir(tmp)
# download and extract all of the .zip files from NYSED data portal
for url in urls:
download_and_extract(url, tmp)
# delete all of the files we don't care about from the temp dir
# and parse all of the ones we _do_ care about
files = os.listdir(tmp)
data = []
for f in files:
if f.endswith("mdb") or "RESEARCHER_FILE" not in f or f.startswith("."):
os.remove(os.path.join(tmp, f))
else:
data.append(os.path.join(tmp, f))
df = pd.concat([read_nysed_exam(d) for d in data])
df = df.reset_index()
# save as csv for outside/generic use
out = os.path.join(config.data_dir, "nysed-exams.csv")
df.to_csv(out, index=False)
# this is a big file, so save as feather for internal use
out = os.path.join(config.data_dir, "nysed-exams.feather")
df.to_feather(out)
# delete the temp folder
shutil.rmtree(tmp, ignore_errors=True)
return df
[docs]def download_and_extract(url, tmp):
"""Download and extract the .zip archives from NYSED"""
zippath = os.path.join(tmp, url.split("/")[-1])
wget.download(url, out=zippath)
zip = zipfile.ZipFile(zippath)
zip.extractall(path=tmp)
[docs]def fix_cols(df):
"""
Fixes the columns in the dataframe so that each NYSED year is consistent
with the other years' columns and column name/formats match other NYC DOE
columns in the data portal.
"""
nysed = df.copy()
# make sure capitalization is consistent
nysed.columns = [c.upper() for c in nysed.columns]
col_map = {
'NAME': 'school_name',
"BEDSCODE":"beds",
"SUBGROUP_NAME":"category",
"ITEM_SUBJECT_AREA":"exam",
"ITEM_DESC":"grade",
"SY_END_DATE":"test_year",
"TOTAL_TESTED":"number_tested",
'TOTAL_ENROLLED': 'total_enrollment',
'TOTAL_NOT_TESTED': 'number_not_tested',
'L1_COUNT':"level_1_n",
'L1_PCT':"level_1_pct",
'L2_COUNT':"level_2_n",
'L2_PCT':"level_2_pct",
'L3_COUNT':"level_3_n",
'L3_PCT':"level_3_pct",
'L4_COUNT':"level_4_n",
'L4_PCT':"level_4_pct",
'L3-L4_PCT':"level_3_4_pct",
'MEAN_SCALE_SCORE':"mean_scale_score",
'SUBGROUP_CODE': 'subgroup_code'
}
# rename columns to match NYC cols
def map_col(col):
if col in col_map:
return col_map[col]
else:
return col
nysed = nysed.rename(columns=map_col)
if "total_enrollment" not in nysed:
nysed["total_enrollment"] = 0
if "number_not_tested" not in nysed:
nysed["number_not_tested"] = 0
return nysed[list(col_map.values())]
[docs]def fix_data(df):
"""
Cleans data in the dataframe so that row-level data is
consistent across test years and matches NYC math/ela data sets:
- student categories are consistent
- counts and percents are consistent
- exam category is consistent
"""
nysed = df.copy()
try: # this for excel
nysed.test_year = nysed.test_year.apply(lambda x: x.date().year)
except: # or this for csv
nysed.test_year = nysed.test_year.apply(lambda x: int(x.split("/")[-1]))
cat_map = {
'English Language Learner': "Current ELL",
'Asian or Native Hawaiian/Other Pacific Islander': 'Asian',
'Hispanic or Latino': 'Hispanic',
'Black or African American': 'Black',
'Not Economically Disadvantaged': 'Not Econ Disadv',
'Economically Disadvantaged': 'Econ Disadv',
'Students with Disabilities': 'SWD',
'General Education Students': 'Not SWD',
'Non-English Language Learner': 'Never ELL'
}
def map_cats(cat):
if cat in cat_map:
return cat_map[cat]
return cat
# rename the student demo categories to match NYC
nysed.category = nysed.category.apply(map_cats)
# rename the exams to match NYC
nysed.exam = nysed.exam.map({"ELA":"ela","Mathematics":"math"})
# make counts into integers
count_cols = [c for c in nysed.columns if c.endswith("_n")]
# if we have these cols, add them too
for c in ["number_tested", "total_enrollment", "number_not_tested", "test_year"]:
if c in nysed:
count_cols.append(c)
for col in count_cols:
nysed[col] = pd.to_numeric(nysed[col], errors='coerce')
nysed[col] = nysed[col].fillna(0)
nysed[col] = nysed[col].astype(int)
nysed["ay"] = nysed.test_year - 1
nysed["level_3_4_n"] = nysed.level_3_n + nysed.level_4_n
# parse the grade leve string to just the grade int
nysed.grade = nysed.grade.apply(lambda x: int(x[6]))
# convert percents from 0-100 to 0-1 to match other data
def fix_pct(x):
if not x or x == "" or x =="-" or x == 0:
return 0
if hasattr(x, "endswith") and x.endswith("%"):
x = x[:-1]
try:
x = float(x)
if 0 <= x <= 1:
return x
x /= 100
return x
except:
print("couldn't fix", x)
return x
pct_cols = [c for c in nysed.columns if c.endswith("pct")]
for c in pct_cols:
nysed[c] = nysed[c].apply(fix_pct)
# nysed[c] = nysed[c].astype("float")
nysed["mean_scale_score"] = pd.to_numeric(nysed["mean_scale_score"], downcast='integer', errors='coerce')
return nysed
[docs]def read_nysed_exam(filename):
"""
Read the Excel file that has all test scores for all schools and districts
in New York State.
"""
try:
nysed = pd.read_csv(filename)
except:
nysed = pd.read_excel(filename)
nysed = fix_cols(nysed)
nysed = fix_data(nysed)
# all_grades = calc_all_grades(nysed.sample(500))
# nysed = nysed[cols]
# all_grades = all_grades[cols]
# nysed = nysed.append(all_grades, ignore_index=True)
return nysed
[docs]def map_nysed_nyc(nysed):
dbn_list = pd.read_feather("dbn-beds-list.feather", columns=["dbn","beds"])
dbn_list[dbn_list.beds.isin(nysed.beds)].drop_duplicates()
cols = list(nysed.columns)
# add dbn to nysed
combined = nysed.merge(dbn_list, on="beds", how="inner")
cols = ["dbn"] + cols
combined = combined[cols]
combined.to_feather("nyc-nysed-exams.feather")
return combined
[docs]def calc_all_grades(nysed):
"""
NYSED data doesn't include All Grades like NYC schools, so this function
adds an "All Grades" category with aggregate data for each school.
"""
# add all students category to nysed
def all_grades_for_school(row):
"""For each school, calculate the aggregate result for all grades"""
qry = f"beds == '{row.beds}' and ay == {row.ay} and category == '{row.category}' and exam=='{row.exam}'"
data = nysed.query(qry)
mean_scale_score = np.average(data.mean_scale_score, weights=data.total_enrollment)
num_tested = data.number_tested.sum()
levels = ["1","2","3","4","3_4"]
if num_tested == 0:
levels_n = [0] * len(levels)
levels_pct = [0] * len(levels)
else:
levels_n = [data[f"level_{i}_n"].sum() for i in levels]
levels_pct = [n / num_tested for n in levels_n ]
result = {
'ay': row.ay,
'exam': row.exam,
'grade': "All Grades",
'category': row.category,
'test_year': data.test_year.max(),
'mean_scale_score': mean_scale_score,
'number_tested': num_tested,
'number_not_tested': data.number_not_tested.sum(),
'level_1_n': levels_n[0],
'level_1_pct': levels_pct[0],
'level_2_n': levels_n[1],
'level_2_pct': levels_pct[1],
'level_3_n': levels_n[2],
'level_3_pct': levels_pct[2],
'level_4_n': levels_n[3],
'level_4_pct': levels_pct[3],
'level_3_4_n': levels_n[4],
'level_3_4_pct': levels_pct[2],
'beds': data.beds.max()
}
rows.append(result)
rows = []
cols = ["beds","ay","exam","category","test_year"]
t = nysed[cols]
t = t.drop_duplicates()
t.apply(all_grades_for_school, axis=1)
return pd.DataFrame(rows)