Source code for nycschools.budgets

# NYC School Data
# Copyright (C) 2023. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os.path
import os
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from . import config, schools

import dotenv
dotenv.load_dotenv()

# Path of the local CSV file holding the galaxy budget data: the configured
# data directory joined with the filename of the "galaxy" entry in config.urls.
__galaxyfile = os.path.join(config.data_dir, config.urls["galaxy"].filename)



def load_galaxy_budgets():
    """Read the cached galaxy budget data from the local CSV file.

    Parameters
    ----------
    None

    Returns
    -------
    data : pandas.DataFrame
        A single DataFrame holding the combined budget data that was
        scraped from the web for all schools in the database.
    """
    cached_budgets = pd.read_csv(__galaxyfile)
    return cached_budgets
def open_webdriver():
    """Create a headless Chrome/Chromium Selenium webdriver.

    The environment variables `CHROME_PATH` and `CHROMEDRIVER_PATH` are
    honoured when set: the first selects the browser binary, the second the
    chromedriver executable. When they are unset (or empty) Selenium falls
    back to its default installation lookup.

    Returns
    -------
    driver : selenium.webdriver.Chrome
        A webdriver that runs without opening a browser window.
    """
    options = webdriver.ChromeOptions()
    # headless: never open the browser GUI
    options.add_argument('--headless')

    browser_binary = os.environ.get("CHROME_PATH", None)
    driver_binary = os.environ.get("CHROMEDRIVER_PATH", None)

    if browser_binary:
        options.binary_location = browser_binary

    # no explicit chromedriver: let Selenium locate one itself
    if not driver_binary:
        return webdriver.Chrome(options=options)

    return webdriver.Chrome(service=Service(driver_binary), options=options)
def __fix_cols(data, sections, dbn):
    """Normalize the column names of the scraped budget tables.

    For each section title, the table at the same position gets a
    ``category`` column holding that title, has its column names
    lower-cased, and has the known label columns (and the first column whose
    name starts with the school's DBN) renamed to ``item``.

    Parameters
    ----------
    data : list of pandas.DataFrame
        The tables scraped from one school's page. The frames passed in are
        not modified.
    sections : list of str
        Section titles; ``sections[i]`` labels ``data[i]``.
    dbn : str
        The school's DBN, matched case-insensitively against column names.

    Returns
    -------
    data : list of pandas.DataFrame
        A new list. Tables that have a matching section are normalized
        copies; any tables beyond ``len(sections)`` are passed through
        unchanged.

    Raises
    ------
    IndexError
        If there are more section titles than tables.
    """
    col_map = {
        'title': 'item',
        'assignment': 'item',
        'organizational category': 'item',
        'total': 'item',
        'total.1': 'item',
        'grand total': 'item',
        'type of class/service': 'service'
    }
    dbn_prefix = dbn.lower()

    # list.copy() alone would still share the DataFrame objects with the
    # caller, so every frame we touch is copied before it is modified.
    frames = list(data)
    for i, section in enumerate(sections):
        frame = frames[i].copy()
        frame["category"] = section
        frame.columns = [c.lower() for c in frame.columns]
        frame = frame.rename(columns=col_map)

        # some tables label their first column with the school itself
        # (the column name starts with the DBN); treat that as the item column
        school_col = [c for c in frame.columns if c.startswith(dbn_prefix)]
        if school_col:
            frame = frame.rename(columns={school_col[0]: 'item'})

        frames[i] = frame
    return frames
def get_galaxy_summary(dbn, ay, driver):
    """Gets the 'galaxy summary' budget for a school from the DOE website.

    Parameters
    ----------
    dbn : str
        The school's DBN. The page is requested using the DBN with its first
        two characters removed.
    ay : int
        The school year, currently only the most recent school year
        (2022-2023) is available. The value is only used to label the
        returned rows; it is not part of the request URL.
    driver : selenium.webdriver.chrome.webdriver.WebDriver
        The Selenium webdriver.

    Returns
    -------
    data : pandas.DataFrame
        A single DataFrame that combines all of the budget tables scraped
        from the web for the school specified by `dbn`, with `dbn` and `ay`
        columns added.

    Raises
    ------
    ValueError
        Raised by `pandas.read_html` when the page contains no tables.
    """
    # Function-scope import keeps this block self-contained.
    from io import StringIO

    url_stem = config.urls["galaxy"].url_stem
    url = f"{url_stem}{dbn[2:]}"
    driver.get(url)
    html = driver.page_source

    # use the section header as row category
    soup = BeautifulSoup(html, 'html.parser')
    sections = [
        section.get_text().strip()
        for section in soup.select('.TO_Section')
    ]

    # Passing literal markup to read_html is deprecated in pandas >= 2.1;
    # a file-like wrapper works on both old and new versions.
    data = pd.read_html(StringIO(html))
    data = __fix_cols(data, sections, dbn)
    data = pd.concat(data)
    data["dbn"] = dbn
    data["ay"] = ay
    return data
def get_galaxy_budgets():
    """Scrapes the 'galaxy summary' budget for all schools from the DOE website.

    Every unique DBN in the school demographics data whose `district` value
    is below 33 is scraped. The combined result is written to the local CSV
    cache and also returned.

    Returns
    -------
    data : pandas.DataFrame
        The combined budget data for all schools that were scraped
        successfully, with the `item` column lower-cased.
    not_found : list of str
        DBNs that could not be scraped. A plain DBN means the scrape raised
        `ValueError`; a DBN followed by ``*`` means it failed with some
        other error.

    Raises
    ------
    ValueError
        Raised by `pandas.concat` if no school could be scraped at all.
    """
    df = schools.load_school_demographics()
    dbns = df[df.district < 33].dbn.unique()

    driver = open_webdriver()
    budgets = []
    not_found = []
    try:
        for dbn in dbns:
            try:
                budgets.append(get_galaxy_summary(dbn, 2022, driver))
            except ValueError:
                not_found.append(dbn)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still stop a long-running scrape.
                print(f"Error scraping {dbn}")
                not_found.append(dbn + "*")
    finally:
        # always shut the browser down, even if the scrape is interrupted
        driver.quit()

    data = pd.concat(budgets)
    # bracket access: attribute-style column assignment is fragile
    data["item"] = data["item"].str.lower()
    data.to_csv(__galaxyfile, index=False)
    return data, not_found