# NYC School Data
# Copyright (C) 2023. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
from io import StringIO
import os
import os.path

import dotenv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from . import config, schools
dotenv.load_dotenv()
__galaxyfile = os.path.join(config.data_dir, config.urls["galaxy"].filename)
[docs]def load_galaxy_budgets():
"""Loads the galaxy budgets from the local cache.
Parameters
----------
None
Returns
-------
data : pandas.DataFrame
A single DataFrame that combines all of the budget data scraped from the web
for all schools in the database."""
return pd.read_csv(__galaxyfile)
[docs]def open_webdriver():
"""Opens a Selenium webdriver using the Chrome/Chromium
engine for Selenium. If the environment variables `CHROME_PATH`
and `CHROMEDRIVER_PATH` are set, they will be used to
initialize the webdriver. Otherwise, the webdriver will be
initialized using the default installation."""
# don't launch the browser GUI
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# if the environment variables are set, use them
chrome_path = os.environ.get("CHROME_PATH", None)
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", None)
if chrome_path:
chrome_options.binary_location = chrome_path
if chromedriver_path:
s = Service(chromedriver_path)
return webdriver.Chrome(service=s, options=chrome_options)
return webdriver.Chrome(options=chrome_options)
def __fix_cols(data, sections, dbn):
"""normalizes column names from scraped data."""
col_map = {
'title': 'item',
'assignment': 'item',
'organizational category': 'item',
'total': 'item',
'total.1': 'item',
'grand total': 'item',
'type of class/service': 'service'
}
data = data.copy()
for i, section in enumerate(sections):
data[i]["category"] = section
data[i].columns = [c.lower() for c in data[i].columns]
data[i].rename(columns=col_map, inplace=True)
school_col = [c for c in data[i].columns if c.lower().startswith(dbn.lower())]
if len(school_col) > 0:
data[i].rename(columns={school_col[0]: 'item'}, inplace=True)
return data
[docs]def get_galaxy_summary(dbn, ay, driver):
"""Gets the 'galaxy summary' budget for a school from the DOE website.
Parameters
----------
dbn : str
The school's DBN.
ay : int
The school year, currently only the most recent school year (2022-2023) is available.
driver : selenium.webdriver.chrome.webdriver.WebDriver
The Selenium webdriver.
Returns
-------
data : pandas.DataFrame
A single DataFrame that combines all of the budget data scraped from the web
for the school specified by `dbn` and `ay`."""
url_stem = config.urls["galaxy"].url_stem
url = f"{url_stem}{dbn[2:]}"
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# use the section header as row category
sections = [section.get_text().strip() for section in soup.select('.TO_Section')]
data = pd.read_html(html)
data = __fix_cols(data, sections, dbn)
data = pd.concat(data)
data["dbn"] = dbn
data["ay"] = ay
return data
[docs]def get_galaxy_budgets():
"""Scrapes the 'galaxy summary' budget for all schools from the DOE website."""
df = schools.load_school_demographics()
dbns = df[df.district < 33].dbn.unique()
driver = open_webdriver()
budgets = []
not_found = []
for dbn in dbns:
try:
budgets.append(get_galaxy_summary(dbn, 2022, driver))
except ValueError:
not_found.append(dbn)
except:
print(f"Error scraping {dbn}")
not_found.append(dbn + "*")
data = pd.concat(budgets)
data.item = data.item.str.lower()
data.to_csv(__galaxyfile, index=False)
return data, not_found