Source code for nycschools.schools

# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os.path
import math
import re

import pandas as pd
from thefuzz import fuzz

from . import geo
from . import config


__demo_filename = os.path.join(config.data_dir, config.urls["demographics"].filename)

[docs]class demo(): """The `demo` class bundles some common sets of column names to make it easier to work with the school demographic `DataFrame` Examples: ---------- from nycschools import schools df = schools.load_school_demographics() basic = df[schools.demo.short_cols] basic.head() """ raw_cols = [ 'dbn', 'school_name', 'year', 'total_enrollment', 'grade_3k_pk_half_day_full', 'grade_k', 'grade_1', 'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12', 'female', 'female_1', 'male', 'male_1', 'asian', 'asian_1', 'black', 'black_1', 'hispanic', 'hispanic_1', 'multi_racial', 'multi_racial_1', 'native_american', 'native_american_1', 'white', 'white_1', 'missing_race_ethnicity_data', 'missing_race_ethnicity_data_1', 'students_with_disabilities', 'students_with_disabilities_1', 'english_language_learners', 'english_language_learners_1', 'poverty', 'poverty_1', 'economic_need_index' ] default_cols = [ 'dbn', 'beds', 'district', "geo_district", 'boro', 'school_name', 'short_name', 'ay', 'year', 'school_type', 'total_enrollment', 'grade_3k_pk_half_day_full', 'grade_k', 'grade_1', 'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12', 'female_n', 'female_pct', 'male_n', 'male_pct', 'asian_n', 'asian_pct', 'black_n', 'black_pct', 'hispanic_n', 'hispanic_pct', 'multi_racial_n', 'multi_racial_pct', 'native_american_n', 'native_american_pct', 'white_n', 'white_pct', 'missing_race_ethnicity_data_n', 'missing_race_ethnicity_data_pct', 'swd_n', 'swd_pct', 'ell_n', 'ell_pct', 'poverty_n', 'poverty_pct', 'eni_pct', 'clean_name', 'zip' ] core_cols = [ 'dbn', 'beds', 'district', "geo_district", 'boro', 'school_name', 'ay', 'total_enrollment', 'female_n', 'female_pct', 'male_n', 'male_pct', 'asian_n', 'asian_pct', 'black_n', 'black_pct', 'hispanic_n', 'hispanic_pct', 'white_n', 'white_pct', 'swd_n', 'swd_pct', 'ell_n', 'ell_pct', 'poverty_n', 'poverty_pct', 'eni_pct' ] short_cols = ["dbn","school_name", "short_name", "clean_name", "ay"] default_map = { 'female':'female_n', 'female_1':'female_pct', 'male':'male_n', 'male_1':'male_pct', 'asian':'asian_n', 'asian_1':'asian_pct', 'black':'black_n', 'black_1':'black_pct', 'hispanic':'hispanic_n', 'hispanic_1':'hispanic_pct', 'multi_racial':'multi_racial_n', 'multi_racial_1':'multi_racial_pct', 'native_american':'native_american_n', 'native_american_1':'native_american_pct', 'white':'white_n', 'white_1':'white_pct', 'missing_race_ethnicity_data':'missing_race_ethnicity_data_n', 'missing_race_ethnicity_data_1':'missing_race_ethnicity_data_pct', 'students_with_disabilities':'swd_n', 'students_with_disabilities_1':'swd_pct', 'english_language_learners':'ell_n', 'english_language_learners_1':'ell_pct', 'poverty':'poverty_n', 'poverty_1':'poverty_pct', 'economic_need_index':'eni_pct' }
[docs]def str_pct(row, pct_col, enroll_col): """generic function to take percentages represented as Strings and convert to real expects data to look like '84.33%', 'Above 95%', 'Below 5%' Example: df["economic_need_index"] = df.apply(lambda row: str_pct(row, "economic_need_index", "total_enrollment"), axis = 1) """ pct = row[pct_col][:-1] # just call the population size `n` n = row[enroll_col] try: pct = float(pct) / 100 except: if "Above" in pct: pct = n * .96 / n elif "Below" in pct: pct = n * .04 / n else: print(f"Exception in {col}: {row['dbn']} - {row['year']}. Value: {pct}") return float(pct)
[docs]def str_count(row, col, enroll_col): """generic function to take whole number counts represented as Strings and convert to integers expects data to look like '246', 'Above 95%', 'Below 5%' Example: df["poverty"] = df.apply(lambda row: str_count(row, "poverty", "total_enrollment"), axis = 1) """ count = row[col] # just call the population size `n` n = row[enroll_col] try: return int(count) except: if "Above" in count: return int(math.ceil(n * .96)) elif "Below" in count: return int(math.floor(n * .04)) else: print(f"Exception in {col}: {row['dbn']} - {row['year']}. Value: {count}")
[docs]def school_type(school): """Any school that serves middle school kids is considered a middle school here.""" if school["middle"]: return "MS" if school["elementary"]: return "PS" if school["hs"]: return "HS" return "NA"
[docs]def clean_name(sn): """Creates a simplified school name that is easier to search/index """ sn = sn.lower() sn = sn.strip() sn = sn.replace(".", "") clean = [] for word in sn.split(" "): try: n = int(word) clean.append(str(n)) except: clean.append(word) sn = " ".join(clean) # ms, is, ps followed by a number p = re.compile(r"\b([m|p|i]s [0-9]*)") m = p.search(sn) if m: sn = sn.replace(m.group(0), "") sn = sn.strip() # if the only thing left is the boro, short name will be school num + boro if sn in ["bronx", "brooklyn", "manhattan", "queens", "staten island"]: sn = f"{m.group(0)} {sn}" return sn
[docs]def short_name(row): """Attempts to guess the common "short name" for a school. For example, the full name might be "P.S. 015 Roberto Clemente". This school is probably commonly referred to as PS 15.""" sn = row.school_name.upper() if "P.S." in sn or "P. S." in sn: return f"PS {row.school_num}" if "M.S." in sn or "M. S." in sn: return f"MS {row.school_num}" if "I. S." in sn or "I.S." in sn: return f"IS {row.school_num}" return f"{row.school_type} {row.school_num}"
[docs]def load_school_demographics(): """ Loads the NYC school-level demographic data from the open data portal and create a dataframe. Adds new columns to the data: short_name: the best guess for the common name of the school (e.g. PS 9) or "" if none exists district: the school district number [1..32, 75, 79, 84] boro: borough code string ["M","B","X","Q","R"] boro_name: string of the full borough name year: the academic year as an integer representing the calendar year in the fall of the school year white_asian: combined number of white and asian students non_white: total number of non white students non_white_asian: total number of non white or asian students black_hispanic: total number of black and hispanic students charter: 1 for charter schools, 0 for community public schools beds: the New York State BEDS code for the school zip: school zip code geo_district: the geographic school district where the school is located will differ for schools where district is > 32 Returns -------------- DataFrame a pandas DataFrame holding school demographic data for all of the schools in the data portal """ # try to load it locally to save time df = pd.read_csv(__demo_filename) df.zip = df.zip.fillna(0).astype("int32") df.beds = df.beds.fillna(0).astype("int64") return df
[docs]def save_demographics(url=config.urls["demographics"].url): """Loads and cleans school demographic data from a NYC Open Data Portal URL. The data is joined with some location data to make it more easily merged with other data sets. A local copy of the data is saved as a .csv in the `nycschools` data directory. Parameters ----------- url : str , default loads the url from config.urls with the 'demographics' key the URL to the most recent NYC school demographics Returns ------- DataFrame a pandas DataFrame holding school demographic data for all of the schools in the data portal """ df = pd.read_csv(url) boros = {"K":"Brooklyn", "X":"Bronx", "M": "Manhattan", "Q": "Queens", "R": "Staten Island"} df["ay"] = df["year"].apply(lambda year: int(year.split("-")[0])) df["district"] = df["dbn"].apply(lambda dbn: int(dbn[:2])) df["boro"] = df["dbn"].apply(lambda dbn: boros[dbn[2]]) df["school_num"] = df.dbn.apply(lambda dbn: int(dbn[3:])) df["charter"] = df.district.apply(lambda x: 1 if x == 84 else 0) # figure out what grades they teach df["pk"] = df["grade_3k_pk_half_day_full"] > 0 df["elementary"] = df["grade_2"] > 0 df["middle"] = df["grade_7"] > 0 df["hs"] = df["grade_10"] > 0 # make it easier to look up schools df["school_type"] = df.apply(school_type, axis=1) df["clean_name"] = df.apply(lambda row: clean_name(row.school_name), axis=1) df["short_name"] = df.apply(short_name, axis=1) df["poverty"] = df.apply(lambda row: str_count(row, "poverty", "total_enrollment"), axis = 1) df["poverty_1"] = df.apply(lambda row: str_pct(row, "poverty_1", "total_enrollment"), axis = 1) df["economic_need_index"] = df.apply(lambda row: str_pct(row, "economic_need_index", "total_enrollment"), axis = 1) df = df.rename(columns=demo.default_map) # calculate school type based on district number df["school_type"] = None df.loc[(df.district > 0) & (df.district < 33), "school_type"] = "community" df.loc[df.district == 84, "school_type"] = "charter" df.loc[df.district == 75, "school_type"] = "d75" df.loc[df.district == 79, "school_type"] = "alternative" df = join_loc_data(df) df = df[demo.default_cols] df.to_csv(__demo_filename, index=False) return df
[docs]def join_loc_data(df): """Join NYS BEDS id, zip code, and other location data. """ school_loc = geo.load_school_locations() # with the left join, we might be missing zip and beds for some schools df = df.merge(school_loc[["dbn", "beds", "zip", "geo_district"]], on="dbn", how="left") # df.beds = df.beds.fillna(0) df.geo_district = df.geo_district.fillna(0) df.geo_district = df.geo_district.astype(int) # df.zip = df.zip.fillna(0).astype("int32") df.beds = pd.to_numeric(df.beds , downcast='integer', errors='coerce') df.beds = df.beds.fillna(0).astype("int64") # df.beds = df.beds.fillna(0).astype("int64") def get_geo(row): d = row.district if row.geo_district > 0: return row.geo_district if d > 0 and d <=32: return d return 0 df.geo_district = df.apply(get_geo, axis=1) return df
[docs]def load_hs_directory(ay=2021): """Loads the NYC High School Directory data from the NYC Open Data Portal. This is a thin wrapper around `pd.read_csv()` and requires an internet connection. By default, the most recent directory is returned. For a different academic year, pass the `ay` parameter. Data is available for academic years 2013-2021. The data varies greatly from year to year, so they are not compiled into a single DataFrame. Parameters ----------- ay : int , default 2021 the academic year for the directory data Returns -------- DataFrame a pandas DataFrame holding the high school directory data """ urls = config.urls["hs_directory"].data_urls ay = str(ay) if ay not in urls.keys(): raise ValueError(f"No data for academic year: {ay}") url = urls[ay] df = pd.read_csv(url) df["ay"] = int(ay) return df