# Source code for nycschools.dataloader

# NYC School Data
# Copyright (C) 2022. Matthew X. Curinga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU AFFERO GENERAL PUBLIC LICENSE (the "License") as
# published by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the License for more details.
#
# You should have received a copy of the License along with this program.
# If not, see <http://www.gnu.org/licenses/>.
# ==============================================================================
import os
import os.path
import py7zr
import requests
import sys
import warnings
import platform
import subprocess

from yaspin import yaspin
from yaspin.spinners import Spinners

from . import config

def get_data_dir():
    """Return the data directory currently stored in the package config."""
    current = config.data_dir
    return current
def download_data():
    """Return a populated data directory, downloading the archive if needed.

    An existing directory located by `find_data_dir` is preferred; it is
    recorded in `config.data_dir` and returned. Only when no such directory
    is found is the archive downloaded into `config.data_dir`.

    Returns:
        str: the path to the data directory
    """
    existing = find_data_dir(config)
    if not existing:
        # nothing usable on disk: fetch and unpack the archive
        return download_archive(config.data_dir)

    config.data_dir = existing
    return existing
def find_data_dir(config):
    """
    Tries to find an existing data directory populated with data, including
    searching through mounted google drive if the `colab` package is
    available and the g drive is mounted in the "standard" location of
    `/content/gdrive`.

    Candidates are tried in this order: `config.data_dir`, the
    `NYC_SCHOOLS_DATA_DIR` environment variable, `./school-data`, and
    finally the directory reported by `mount_colab_data_dir`. The first
    candidate that is a directory holding the data files is stored in
    `config.data_dir` and returned.

    Parameters:
        config: the package configuration object; its `data_dir` attribute
            is read and, on success, overwritten

    Returns:
        str: the path to the data directory, or `None` if none was found
    """
    def is_data_dir(path):
        # isdir (rather than exists) keeps a regular file, or a path that
        # does not exist, from reaching os.listdir in contains_data_files
        return bool(path) and os.path.isdir(path) and contains_data_files(path)

    candidates = [
        config.data_dir,
        os.environ.get("NYC_SCHOOLS_DATA_DIR", None),
        os.path.join(".", "school-data"),
    ]

    # first look locally
    for path in candidates:
        if is_data_dir(path):
            config.data_dir = path
            return path

    # if not, try to mount a google drive; the returned path is built from
    # a directory name and is not guaranteed to exist, so it gets the same
    # directory check as the local candidates
    path = mount_colab_data_dir()
    if is_data_dir(path):
        config.data_dir = path
        return path

    return None
def mount_colab_data_dir():
    """Try to mount a google drive directory in colab and then search for
    the data directory by looking for a directory with the known name
    'nyc-schools-data'.

    `MyDrive` is searched before `Shareddrives`; the first match wins.

    Returns:
        str: `<match>/nyc-schools-data/data` for the first directory named
        'nyc-schools-data' that is found (the `data` sub-directory itself
        is not checked for existence), or `None` when the drive cannot be
        mounted or no match exists
    """
    gdrive = "/content/gdrive"
    target = "nyc-schools-data"

    if not os.path.exists(gdrive):
        try:
            from google.colab import drive
            drive.mount(gdrive)
        except Exception:
            # Best effort: any failure to import or mount means no drive.
            # `Exception` (not a bare except) so that Ctrl-C and
            # SystemExit still propagate.
            return None

    # os.walk yields nothing for a missing top directory, so no existence
    # check is needed for either location
    for top in ("MyDrive", "Shareddrives"):
        for root, dirs, _files in os.walk(f"{gdrive}/{top}"):
            if target in dirs:
                return os.path.join(root, f"{target}/data")

    return None
def contains_data_files(path):
    """
    Checks to see if the specified path contains the data files required by
    this application.

    A warning is issued when the directory holds some, but not all, of the
    expected files. No warning is issued when it holds none of them.

    Parameters:
        path (str): the path to check

    Returns:
        bool: True if the path contains the data files required by this
        application, False otherwise (including when `path` is empty,
        missing, or not a directory)
    """
    # os.listdir raises for a missing path or a regular file, and lists the
    # current directory when handed None; none of those is a data directory
    if not path or not os.path.isdir(path):
        return False

    files = set(os.listdir(path))
    if not files:
        return False

    expected = {
        "charter-ela.csv",
        "charter-math.csv",
        "nyc-ela.csv",
        "nyc-math.csv",
        "nysed-exams.csv",
        "nysed-exams.feather",
        "school-demographics.csv",
        "school_locations.geojson",
    }

    missing = expected.difference(files)
    if not missing:
        return True

    # none of the expected files are here: just some unrelated directory,
    # so stay quiet
    if len(missing) == len(expected):
        return False

    # a partial data set is worth telling the user about
    warnings.warn(f"""Some data files are missing from the data directory.
Found files: {files.intersection(expected)}
Missing files: {missing}
You can download the data files by running:
`python -m nycschools.dataloader -d`
For more information, see:
https://adelphi-ed-tech.github.io/nycschools/""")
    return False
def download_archive(data_dir=None):
    """
    Downloads the school data archive to the local drive and saves it into
    `data_dir` then extracts the .7z archive. `data_dir` now contains the
    cleaned and compiled school data files.

    The download URL and archive file name come from
    `config.urls["school-data-archive"]`. The archive file is deleted once
    extraction finishes, whether or not it succeeded.

    Parameters:
        data_dir (str): the path to the directory where the data files
            should be saved. If not specified, the package configuration
            `data_dir` is used; if specified, it is also stored in
            `config.data_dir`. The directory is created if missing.

    Returns:
        str: the absolute path of the directory the data was extracted into

    Raises:
        requests.HTTPError: if the server answers with an error status
        OSError: if the directory or archive file cannot be written
    """
    if not data_dir:
        print("no data dir")
        data_dir = config.data_dir
    else:
        config.data_dir = data_dir
    print("using data dir", data_dir)

    source = config.urls["school-data-archive"]
    data_dir = os.path.abspath(data_dir)
    # callers such as download_data pass a directory that may not exist yet
    os.makedirs(data_dir, exist_ok=True)
    archive = os.path.join(data_dir, source.filename)

    resp = requests.get(source.url)
    # fail here with a clear HTTP error rather than writing an error page
    # to disk and letting py7zr choke on it
    resp.raise_for_status()

    try:
        with open(archive, "wb") as f:
            f.write(resp.content)
        with py7zr.SevenZipFile(archive, mode='r') as z:
            z.extractall(path=data_dir)
    finally:
        # never leave a partial or corrupt archive behind
        if os.path.exists(archive):
            os.remove(archive)

    return data_dir
def get_venv_activate():
    """Locate the `activate` script of the active virtual environment.

    The environment root is read from the `VIRTUAL_ENV` environment
    variable and the script is assumed to live at `<root>/bin/activate`;
    the file's existence is not checked.

    Returns:
        str: path to the activation script, or `None` when `VIRTUAL_ENV`
        is unset or empty
    """
    venv_root = os.environ.get("VIRTUAL_ENV")
    return os.path.join(venv_root, "bin", "activate") if venv_root else None
def find_config_file():
    """Pick the shell start-up file that an `export` line can be added to.

    Checked in order: the active virtual environment's activation script,
    then `~/.bashrc`, `~/.bash_profile` and `~/.profile`. The first one
    that exists on disk is used. A warning is issued if none exist.

    Returns:
        str: the path to the configuration file or `None` if not found
    """
    candidates = (
        get_venv_activate(),
        os.path.expanduser("~/.bashrc"),
        os.path.expanduser("~/.bash_profile"),
        os.path.expanduser("~/.profile"),
    )

    # get_venv_activate() may be None, hence the truthiness test
    found = next((c for c in candidates if c and os.path.exists(c)), None)
    if found is not None:
        return found

    warnings.warn(f"""Could not find a venv or bash configuration file to edit.
You should manually set the NYC_SCHOOLS_DATA_DIR environment variable.
See the full documentation at:
{config.urls["docbook"].url}""")
    return None
def set_env_var(data_dir):
    """Attempts to set the NYC_SCHOOLS_DATA_DIR environment variable based
    on the user's platform.

    On Windows the variable is set with `setx`. On Linux and macOS an
    `export` line is appended to the file chosen by `find_config_file`; if
    no such file is found nothing is written. Other platforms are ignored.

    Parameters:
        data_dir (str): the directory to store in the variable
    """
    import shlex

    # This will return 'Windows', 'Linux', or 'Darwin' (for macOS)
    os_type = platform.system()

    if os_type == "Windows":
        subprocess.run(["setx", "NYC_SCHOOLS_DATA_DIR", data_dir])
    elif os_type in ["Linux", "Darwin"]:
        config_path = find_config_file()
        if not config_path:
            return

        # Quote the value so spaces or shell metacharacters in the path
        # cannot break (or inject into) the sourced file. `~` is expanded
        # first because the shell will not expand it inside quotes.
        value = shlex.quote(os.path.expanduser(data_dir))

        print("Writing env var to", config_path)
        with open(config_path, "a") as f:
            # leading newline: the file may not end with one, and the
            # export must not be glued onto its last line
            f.write(f"\nexport NYC_SCHOOLS_DATA_DIR={value}\n")

        print(f"""To access the data files in your current terminal session,
you must run the following command:
source {config_path}
""")
def download_cache():
    """Download the data archive and save it to the local drive.

    This interactive terminal program prompts the user for the location to
    save the data files. Once the files are downloaded and expanded it
    attempts to write the path to the data files into the python
    configuration environment.

    An empty answer to the location prompt selects `config.data_dir`.
    """
    print("Default data location:", config.data_dir)

    def prompt_data():
        # Loop (not recursion) so repeated bad answers cannot exhaust the
        # stack.
        while True:
            answer = input("Enter the path to the data directory (or <enter> for default): ")
            # An empty answer means "use the default"; previously it was
            # passed to os.makedirs(""), which always fails, so the default
            # could never be chosen.
            path = os.path.expanduser(answer.strip()) or config.data_dir

            # check if it exists, if not create it, catch errors and re-prompt
            try:
                os.makedirs(path, exist_ok=True)
            except (OSError, TypeError):
                # TypeError covers a default that is not configured (None)
                print(f"Could not create directory {path}")
                continue

            # check if it's writeable
            if not os.access(path, os.W_OK):
                print(f"Cannot write to directory {path}")
                print("Either change file permissions or choose a different directory.")
                continue

            return path

    data_dir = prompt_data()
    print(f"Downloading school data to {data_dir}")

    with yaspin(text="loading...", spinner=Spinners.bouncingBall) as sp:
        sp.side = "right"
        data_dir = download_archive(data_dir)
        sp.ok("✔")

    print(f"Data successfully saved to: {data_dir}")

    auto_config = input("Automatically set environment variable? [Y/n]: ")
    if auto_config.lower() == "y" or len(auto_config) == 0:
        set_env_var(data_dir)
        print("Environment variable set.")
    else:
        # f-string so the documentation URL is interpolated instead of the
        # placeholder being printed literally
        print(f"""You must configure the NYC_SCHOOLS_DATA_DIR environment variable.
See the full documentation at:
{config.urls["docbook"].url}""")
def main():
    """Show the path to the `data_dir` where school data is stored.
    To use the interactive downloader, run `python -m nycschools.dataloader -d`.
    """
    # NOTE: the docstring above is user-facing; it is printed for --help.
    print(f"Current data directory: {config.data_dir}")

    # only the first command-line argument is looked at
    args = sys.argv[1:]
    if not args:
        return

    flag = args[0]
    if flag in ("-h", "--help"):
        print("docs", main.__doc__)
    elif flag in ("-d", "--download"):
        download_cache()
    else:
        print("Unrecognized argument. Run `python -m nycschools.dataloader --help` for help.")
    return
# Run the command-line entry point only when executed as a script or with
# `python -m`, not when the module is imported.
if __name__ == "__main__": main()