# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os.path
import pandas as pd
from . import config
urls = config.urls
__class_size_file = os.path.join(config.data_dir, config.urls["class_size"].filename)
__ptr_file = os.path.join(config.data_dir, config.urls["class_size"].filename_ptr)
[docs]def load_class_size():
return pd.read_csv(__class_size_file)
[docs]def load_ptr():
return pd.read_csv(__ptr_file)
[docs]def get_class_22(url):
"""Read class size data for 2022 from
the DOE InfoHub Excel file format.
This data also contains pupil teacher ratios
which are saved in a separate file.
"""
xls = pd.read_excel(url, sheet_name=None)
# first sheet is k-8
k8 = xls["K-8 Avg"]
hs = xls["MS HS Avg"]
ptr = xls["PTR"]
ptr.columns = ['dbn', 'school_name', 'ptr']
ptr["ay"] = 2022
df = pd.concat([k8, hs], axis=0)
df["ay"] = 2022
df.columns = ['dbn', 'school_name', 'grade', 'program_type',
'students_n', 'classes_n', 'avg_class_size',
'min_class_size', 'max_class_size', 'dept', 'subject',
'ay']
return df, ptr
[docs]def get_class_size_year(ay, url):
def fix_rows(row):
if row.min_class_size == "<15":
row.min_class_size = row.students_n
if row.max_class_size == "<15":
row.max_class_size = row.students_n
if row.grade in ['K', 1, 2, 3, 4, 5,]:
row.subject = "Elementary"
row.dept = "Elementary"
return row
if (ay == 2022):
df, ptr = get_class_22(url)
ptr.to_csv(__ptr_file, index=False)
else:
df = pd.read_csv(url, dtype={'dbn': str})
df["ay"] = ay
df.rename(columns={
"number_of_students": "students_n",
"grade_level": "grade",
"number_of_classes":"classes_n",
"minimum_class_size":"min_class_size",
"maximum_class_size":"max_class_size",
"average_class_size":"avg_class_size"
}, inplace=True)
df = df.apply(fix_rows, axis=1)
return df
[docs]def get_class_size():
"""Get class size data from the web and cleans class
size data for each year that it is available
in the `datasets`. Currently data is available for each year from
2009-2021 excluding the 2020-2021 school year.
Returns
-------
DataFrame
a pandas DataFrame holding school demographic data for all of the schools
in the data portal
"""
years =[]
for ay, url in urls["class_size"].data_urls.items():
data = get_class_size_year(int(ay), url)
years.append(data)
df = pd.concat(years)
df.to_csv(__class_size_file, index=False)
return df