Source code for nycschools.class_size

# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os.path

import pandas as pd


from . import config
urls = config.urls
__class_size_file = os.path.join(config.data_dir, config.urls["class_size"].filename)
__ptr_file = os.path.join(config.data_dir, config.urls["class_size"].filename_ptr)

[docs]def load_class_size():
    return pd.read_csv(__class_size_file)

[docs]def load_ptr():
    return pd.read_csv(__ptr_file)

[docs]def get_class_22(url):
    """Read class size data for 2022 from
       the DOE InfoHub Excel file format.
       This data also contains pupil teacher ratios
       which are saved in a separate file.
    """

    xls = pd.read_excel(url, sheet_name=None)
    # first sheet is k-8
    k8 = xls["K-8 Avg"]
    hs = xls["MS HS Avg"]
    ptr = xls["PTR"]
    ptr.columns = ['dbn', 'school_name', 'ptr']
    ptr["ay"] = 2022

    df = pd.concat([k8, hs], axis=0)
    df["ay"] = 2022

    df.columns = ['dbn', 'school_name', 'grade', 'program_type',
                'students_n', 'classes_n', 'avg_class_size',
                'min_class_size', 'max_class_size', 'dept', 'subject',
                'ay']

    return df, ptr


[docs]def get_class_size_year(ay, url):
    def fix_rows(row):
        if row.min_class_size == "<15":
            row.min_class_size = row.students_n
        if row.max_class_size == "<15":
            row.max_class_size = row.students_n
        if row.grade in ['K', 1, 2, 3, 4, 5,]:
            row.subject = "Elementary"
            row.dept = "Elementary"
        return row

    if (ay == 2022):
        df, ptr = get_class_22(url)

        ptr.to_csv(__ptr_file, index=False)
    else:
        df = pd.read_csv(url, dtype={'dbn': str})
        df["ay"] = ay
        df.rename(columns={
            "number_of_students": "students_n", 
            "grade_level": "grade",
            "number_of_classes":"classes_n",
            "minimum_class_size":"min_class_size",
            "maximum_class_size":"max_class_size",
            "average_class_size":"avg_class_size"
            }, inplace=True)
        
    df = df.apply(fix_rows, axis=1)
    return df


[docs]def get_class_size():
    """Get class size data from the web and cleans class 
    size data for each year that it is available
    in the `datasets`. Currently data is available for each year from
    2009-2021 excluding the 2020-2021 school year.

    Returns
    -------
    DataFrame
        a pandas DataFrame holding school demographic data for all of the schools
        in the data portal

    """
    years =[]
    for ay, url in urls["class_size"].data_urls.items():
        data = get_class_size_year(int(ay), url)
        years.append(data)
    df = pd.concat(years)
    df.to_csv(__class_size_file, index=False)
    return df
NYC Schools Open Data Portal

Source code for nycschools.class_size