Source code for nycschools.schools

# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os.path
import math
import re

import pandas as pd
from thefuzz import fuzz

from . import geo
from . import config


__demo_filename = os.path.join(config.data_dir, config.urls["demographics"].filename)

[docs]class demo():
    """The `demo` class bundles some common sets of column names
    to make it easier to work with the school demographic `DataFrame`

    Examples:
    ----------
    from nycschools import schools

    df = schools.load_school_demographics()
    basic = df[schools.demo.short_cols]
    basic.head()
    """
    raw_cols = [
       'dbn', 'school_name', 'year', 'total_enrollment',
       'grade_3k_pk_half_day_full', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'female', 'female_1', 'male',
       'male_1', 'asian', 'asian_1', 'black', 'black_1', 'hispanic',
       'hispanic_1', 'multi_racial', 'multi_racial_1', 'native_american',
       'native_american_1', 'white', 'white_1', 'missing_race_ethnicity_data',
       'missing_race_ethnicity_data_1', 'students_with_disabilities',
       'students_with_disabilities_1', 'english_language_learners',
       'english_language_learners_1', 'poverty', 'poverty_1',
       'economic_need_index'
    ]

    default_cols = [
        'dbn',
        'beds',
        'district',
        "geo_district",
        'boro',
        'school_name',
        'short_name',
        'ay',
        'year',
        'school_type',
        'total_enrollment',
        'grade_3k_pk_half_day_full',
        'grade_k',
        'grade_1',
        'grade_2',
        'grade_3',
        'grade_4',
        'grade_5',
        'grade_6',
        'grade_7',
        'grade_8',
        'grade_9',
        'grade_10',
        'grade_11',
        'grade_12',
        'female_n',
        'female_pct',
        'male_n',
        'male_pct',
        'asian_n',
        'asian_pct',
        'black_n',
        'black_pct',
        'hispanic_n',
        'hispanic_pct',
        'multi_racial_n',
        'multi_racial_pct',
        'native_american_n',
        'native_american_pct',
        'white_n',
        'white_pct',
        'missing_race_ethnicity_data_n',
        'missing_race_ethnicity_data_pct',
        'swd_n',
        'swd_pct',
        'ell_n',
        'ell_pct',
        'poverty_n',
        'poverty_pct',
        'eni_pct',
        'clean_name',
        'zip'
    ]

    core_cols = [
        'dbn',
        'beds',
        'district',
        "geo_district",
        'boro',
        'school_name',
        'ay',
        'total_enrollment',
        'female_n',
        'female_pct',
        'male_n',
        'male_pct',
        'asian_n',
        'asian_pct',
        'black_n',
        'black_pct',
        'hispanic_n',
        'hispanic_pct',
        'white_n',
        'white_pct',
        'swd_n',
        'swd_pct',
        'ell_n',
        'ell_pct',
        'poverty_n',
        'poverty_pct',
        'eni_pct'
    ]

    short_cols = ["dbn","school_name", "short_name", "clean_name", "ay"]

    default_map = {
        'female':'female_n',
        'female_1':'female_pct',
        'male':'male_n',
        'male_1':'male_pct',
        'asian':'asian_n',
        'asian_1':'asian_pct',
        'black':'black_n',
        'black_1':'black_pct',
        'hispanic':'hispanic_n',
        'hispanic_1':'hispanic_pct',
        'multi_racial':'multi_racial_n',
        'multi_racial_1':'multi_racial_pct',
        'native_american':'native_american_n',
        'native_american_1':'native_american_pct',
        'white':'white_n',
        'white_1':'white_pct',
        'missing_race_ethnicity_data':'missing_race_ethnicity_data_n',
        'missing_race_ethnicity_data_1':'missing_race_ethnicity_data_pct',
        'students_with_disabilities':'swd_n',
        'students_with_disabilities_1':'swd_pct',
        'english_language_learners':'ell_n',
        'english_language_learners_1':'ell_pct',
        'poverty':'poverty_n',
        'poverty_1':'poverty_pct',
        'economic_need_index':'eni_pct'
    }

[docs]def str_pct(row, pct_col, enroll_col):
    """generic function to take percentages represented as Strings and convert to real
    expects data to look like '84.33%', 'Above 95%', 'Below 5%'
    Example: df["economic_need_index"] = df.apply(lambda row: str_pct(row, "economic_need_index", "total_enrollment"), axis = 1)
    """
    pct = row[pct_col][:-1]
    # just call the population size `n`
    n = row[enroll_col]

    try:
        pct = float(pct) / 100
    except:
        if "Above" in pct:
            pct = n * .96 / n
        elif "Below" in pct:
            pct = n * .04 / n
        else:
            print(f"Exception in {col}: {row['dbn']} - {row['year']}. Value: {pct}")

    return float(pct)

[docs]def str_count(row, col, enroll_col):
    """generic function to take whole number counts represented as Strings and convert to integers
    expects data to look like '246', 'Above 95%', 'Below 5%'
    Example: df["poverty"] = df.apply(lambda row: str_count(row, "poverty", "total_enrollment"), axis = 1)
    """
    count = row[col]
    # just call the population size `n`
    n = row[enroll_col]
    try:
        return int(count)
    except:
        if "Above" in count:
            return int(math.ceil(n * .96))
        elif "Below" in count:
            return int(math.floor(n * .04))
        else:
            print(f"Exception in {col}: {row['dbn']} - {row['year']}. Value: {count}")

[docs]def school_type(school):
    """Any school that serves middle school kids is considered a middle school here."""

    if school["middle"]:
        return "MS"
    if school["elementary"]:
        return "PS"
    if school["hs"]:
        return "HS"
    return "NA"


[docs]def clean_name(sn):
    """Creates a simplified school name that is easier to search/index

    """
    sn = sn.lower()
    sn = sn.strip()
    sn = sn.replace(".", "")
    clean = []
    for word in sn.split(" "):
        try:
            n = int(word)
            clean.append(str(n))
        except:
            clean.append(word)

    sn = " ".join(clean)

    # ms, is, ps followed by a number
    p = re.compile(r"\b([m|p|i]s [0-9]*)")
    m = p.search(sn)
    if m:
        sn = sn.replace(m.group(0), "")
    sn = sn.strip()
    # if the only thing left is the boro, short name will be school num + boro
    if sn in ["bronx", "brooklyn", "manhattan", "queens", "staten island"]:
        sn = f"{m.group(0)} {sn}"
    return sn


[docs]def short_name(row):
    """Attempts to guess the common "short name" for a school.
For example, the full name might be "P.S. 015 Roberto Clemente".
This school is probably commonly referred to as PS 15."""
    sn = row.school_name.upper()
    if "P.S." in sn or "P. S." in sn:
        return f"PS {row.school_num}"
    if "M.S." in sn or "M. S." in sn:
        return f"MS {row.school_num}"
    if "I. S." in sn or "I.S." in sn:
        return f"IS {row.school_num}"

    return f"{row.school_type} {row.school_num}"


[docs]def load_school_demographics():
    """ Loads the NYC school-level demographic data from the
    open data portal and create a dataframe.

    Adds new columns to the data:

           short_name: the best guess for the common name of the school (e.g. PS 9)
                       or "" if none exists
             district: the school district number [1..32, 75, 79, 84]
                 boro: borough code string ["M","B","X","Q","R"]
            boro_name: string of the full borough name
                 year: the academic year as an integer representing the calendar
                       year in the fall of the school year
          white_asian: combined number of white and asian students
            non_white: total number of non white students
      non_white_asian: total number of non white or asian students
       black_hispanic: total number of black and hispanic students
              charter: 1 for charter schools, 0 for community public schools
                 beds: the New York State BEDS code for the school
                  zip: school zip code
         geo_district: the geographic school district where the school is located
                       will differ for schools where district is > 32

    Returns
    --------------
    DataFrame
        a pandas DataFrame holding school demographic data for all of the schools
        in the data portal

    """

    # try to load it locally to save time
    df = pd.read_csv(__demo_filename)
    df.zip = df.zip.fillna(0).astype("int32")
    df.beds = df.beds.fillna(0).astype("int64")
    return df

[docs]def save_demographics(url=config.urls["demographics"].url):
    """Loads and cleans school demographic data from a NYC Open Data
    Portal URL. The data is joined with some location data to make it
    more easily merged with other data sets. A local copy of the data
    is saved as a .csv in the `nycschools` data directory.

    Parameters
    -----------
    url : str , default loads the url from config.urls with the 'demographics' key
          the URL to the most recent NYC school demographics

    Returns
    -------
    DataFrame
        a pandas DataFrame holding school demographic data for all of the schools
        in the data portal

    """

    df = pd.read_csv(url)

    boros = {"K":"Brooklyn", "X":"Bronx", "M": "Manhattan", "Q": "Queens", "R": "Staten Island"}

    df["ay"] = df["year"].apply(lambda year: int(year.split("-")[0]))
    df["district"] = df["dbn"].apply(lambda dbn: int(dbn[:2]))

    df["boro"] = df["dbn"].apply(lambda dbn: boros[dbn[2]])
    df["school_num"] = df.dbn.apply(lambda dbn: int(dbn[3:]))
    df["charter"] = df.district.apply(lambda x: 1 if x == 84 else 0)

    # figure out what grades they teach
    df["pk"] = df["grade_3k_pk_half_day_full"] > 0
    df["elementary"] = df["grade_2"] > 0
    df["middle"] = df["grade_7"] > 0
    df["hs"] = df["grade_10"] > 0

    # make it easier to look up schools
    df["school_type"] = df.apply(school_type, axis=1)
    df["clean_name"] = df.apply(lambda row: clean_name(row.school_name), axis=1)
    df["short_name"] = df.apply(short_name, axis=1)

    df["poverty"] = df.apply(lambda row: str_count(row, "poverty", "total_enrollment"), axis = 1)
    df["poverty_1"] = df.apply(lambda row: str_pct(row, "poverty_1", "total_enrollment"), axis = 1)
    df["economic_need_index"] = df.apply(lambda row: str_pct(row, "economic_need_index", "total_enrollment"), axis = 1)
    
    df = df.rename(columns=demo.default_map)
    
    # calculate school type based on district number
    df["school_type"] = None
    df.loc[(df.district > 0)  & (df.district < 33), "school_type"] = "community"
    df.loc[df.district == 84, "school_type"] = "charter"
    df.loc[df.district == 75, "school_type"] = "d75"
    df.loc[df.district == 79, "school_type"] = "alternative"
    df = join_loc_data(df)
    df = df[demo.default_cols]
    df.to_csv(__demo_filename, index=False)

    return df


[docs]def join_loc_data(df):
    """Join NYS BEDS id, zip code, and other location data.

    """
    school_loc = geo.load_school_locations()
    # with the left join, we might be missing zip and beds for some schools
    df = df.merge(school_loc[["dbn", "beds", "zip", "geo_district"]], on="dbn", how="left")
    # df.beds = df.beds.fillna(0)
    df.geo_district = df.geo_district.fillna(0)
    df.geo_district = df.geo_district.astype(int)
    # df.zip = df.zip.fillna(0).astype("int32")
    df.beds = pd.to_numeric(df.beds , downcast='integer', errors='coerce')
    df.beds = df.beds.fillna(0).astype("int64")

    # df.beds = df.beds.fillna(0).astype("int64")

    def get_geo(row):
        d = row.district
        if row.geo_district > 0:
            return row.geo_district
        if d > 0 and d <=32:
            return d
        return 0
    df.geo_district = df.apply(get_geo, axis=1)
    return df

[docs]def search(df, qry):
    """Search a DataFrame for a school using fuzzy logic.


    Parameters
    ----------
    df : DataFrame
         a pandas DataFrame containing school data, such as the
         data returned from `load_school_demographics()`
    qry : the school name or search term to look for in the data set

    Returns
    ---------
    DataFrame
        the schools that match the `qry` or an empty DataFrame
        if no matches were found

    """
    t = df.copy(deep=False)
    latest = t.ay.max()
    # just check the lastest year
    t = t[t["ay"] == latest]

    # see if there's an exact name match
    q = clean_name(qry)
    results = t.query(f"clean_name == '{q}'")
    if len(q) > 0 and len(results) > 0:
        return results

    # look it up by number
    results = t.query(f"short_name == '{qry.upper()}'")
    if len(results) > 0:
        return results

    # fuzzy search -- add a fuzzy search match column
    t["match"] = t.school_name.apply(lambda sn: fuzz.token_set_ratio(qry, sn))
    t = t.query("match > 80")
    t = pd.DataFrame(t)
    # now sort the results based on the ratio match
    t["match"] = t.clean_name.apply(lambda sn: fuzz.ratio(qry, sn))
    results = t.sort_values(by=["match"], ascending=False)

    return results


[docs]def load_hs_directory(ay=2021):
    """Loads the NYC High School Directory data from the NYC Open Data
    Portal. This is a thin wrapper around `pd.read_csv()` and
    requires an internet connection. By default, the most recent directory
    is returned. For a different academic year, pass the `ay` parameter.
    Data is available for academic years 2013-2021. The data varies greatly
    from year to year, so they are not compiled into a single DataFrame.

    Parameters
    -----------
    ay : int , default 2021
         the academic year for the directory data

    Returns
    --------
    DataFrame
        a pandas DataFrame holding the high school directory data

    """
    urls = config.urls["hs_directory"].data_urls
    ay = str(ay)

    if ay not in urls.keys():
        raise ValueError(f"No data for academic year: {ay}")
    url = urls[ay]
    df = pd.read_csv(url)
    df["ay"] = int(ay)
    return df
NYC Schools Open Data Portal

Source code for nycschools.schools