feat: final push for submission

2025-12-06 03:59:17 +00:00
parent a6e6a51041
commit a2b6b6fedd
17 changed files with 644 additions and 48 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 data/
 .ipynb_checkpoints/
+__pycache__/
--- a/README.md
+++ b/README.md
@@ -0,0 +1,48 @@
+# Educational Statistics Analysis Tool
+
+Python utility to parse data from the Department of Education and National Science Foundation.
+
+## Description
+
+This project was created to answer the question "If I have $5, should I give it to the art department or to the history department?"
+
+It does so by gathering historical enrollment data from the Department of Education's College Scorecard, indexable by year, institution, and subject, and comparing it against the National Science Foundation's funding data, which is indexable by year, institution, subject, and funding source.
+
+## Getting Started
+
+### Dependencies
+
+* This project was built against Python 3.12
+* Some scripts require a very large amount of memory (see the conversion step below). An HPC cluster or other server environment is recommended.
+
+### Executing program
+
+* Due to concerns with license restrictions on data, I have opted not to redistribute any raw or modified data files with this project. Instead, data must be downloaded from the DoE and NSF directly. This was automated with the `download.sh` file.
+* The `.csv` files downloaded from the College Scorecard must be converted into an Awkward Array for effective data analysis. This is done with `convert.py`. **This step requires 80GB of RAM**, and as such, must be done on a machine with sufficient memory.
+* Final plotting can be done with `run.py`. Sample outputs are provided for user convenience in `output.txt` (for tabular outputs) and in the `plots/` directory.
+
+## Help Wanted!
+
+A traditional repository would use the Issues tab of its git host as the feature request / help wanted section. However, I have not tested the Issues tab (I am also the only one who uses this git server instance), so a simple list will suffice:
+
+- Implement cross-comparison of subject fields with the best available subject equivalent in the funding column
+- Implement mass-scraping of the NSF site to gather funding data for all institutions across all years, rather than just 2025 data for UA
+- Modify `convert.py` to accept the CSVs as Dask DataFrames rather than as Pandas tables, to allow for distributed and/or delayed computing of the conversion. This would allow for people with ordinary laptops to run the program
+
+## Authors
+
+* Nathan Nguyen
+
+## Version History
+
+* 0.1
+    * Initial Release
+
+## License
+
+I'm not a lawyer - most of this data probably has restrictions, anyways. If I was, though, I'd use the [NAME HERE](https://www.youtube.com/watch?v=XfELJU1mRMg) License - see the LICENSE.md file for details
+
+## Acknowledgments
+
+Inspiration, code snippets, etc.
+* [ReadMe Template](https://gist.githubusercontent.com/DomPizzie/7a5ff55ffa9081f2de27c315f5018afc/raw/d59043abbb123089ad6602aba571121b71d91d7f/README-Template.md)
--- a/convert.py
+++ b/convert.py
@@ -1,29 +1,47 @@
 import os
 import yaml
+import logging

 import awkward as ak
 import pandas as pd
 import numpy as np
 import tqdm

-scorecard_dir = "data/scorecard"
-scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
+import utils

-print("Loading data.yaml")
+
+
+
+if __name__ == "__main__":
+    # Setup Args
+    parser = utils.get_common_args(prog="CSV -> Pandas Scorecard Converter")
+    args = parser.parse_args()
+
+    # Setup Logging
+    utils.setup_logging(args.debug)
+    logger = logging.getLogger("CSVPandasConverter")
+
+    scorecard_dir = os.path.join(args.data_dir, "scorecard")
+    scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
+    logger.info(f"Loading College Scorecard data from directory {scorecard_dir}")
+
+    logger.debug("Loading metadata from data.yaml")
    with open(os.path.join(scorecard_dir, 'data.yaml'), 'r') as file:
        data = yaml.safe_load(file)

-print("Loading CSVs to dataframes")
+    logger.info("Loading all CSV files as Pandas dataframes")
    files = [f'MERGED{i}_{(i + 1) % 100:02}_PP.csv' for i in tqdm.trange(1996, 2024)]
    dataframes = [pd.read_csv(os.path.join(scorecard_dir, file)) for file in tqdm.tqdm(files)]

-print("Appending extra rows where needed")
+    logger.info("Creating list of all UNITIDs across all files")
    unit_ids = np.unique(np.hstack([frame.UNITID.to_numpy() for frame in tqdm.tqdm(dataframes)]))
+
+    logger.info("Appending extra columns to each year's dataframes to prepare for appending")
    for i, frame in tqdm.tqdm(enumerate(dataframes)):
        new_rows = pd.DataFrame({"UNITID": unit_ids[~np.isin(unit_ids, frame.UNITID)]})
        dataframes[i] = pd.concat([frame, new_rows]).sort_values(by=["UNITID", "OPEID"])

-print("Converting to Results Array")
+    logger.info("Converting to Results Array")
    result = {}
    for key, sec in tqdm.tqdm(data['dictionary'].items()):
        if 'calculate' in sec:
@@ -49,12 +67,12 @@ for key, sec in tqdm.tqdm(data['dictionary'].items()):
            obj = obj.astype(str)
        section[parts[-1]] = obj

-print("Cleanup: Deleting Dataframes from Memory")
+    logger.info("Cleanup: Deleting Dataframes from Memory")
    del dataframes # Memory cleanup

-print("Converting to Awkward Array")
+    logger.info("Converting to Awkward Array")
    a = ak.Array(result)
    del result # Memory cleanup
    
-print("Writing to Disk")
+    logger.info("Writing to Disk")
    ak.to_parquet(a, os.path.join(scorecard_dir, "merged.parquet"))
--- a/excel_tools.py
+++ b/excel_tools.py
@@ -0,0 +1,31 @@
+import pandas as pd
+
+
+# Column labels looked up by read_excel(): `key_all` is the divisor, `keys` lists the per-source columns.
+key_all = "All R&D expenditures"
+key_federal = "Federal government"
+key_state = "State and local government"
+key_inst = "Institution funds"
+key_business = "Business"
+key_nonprofit = "Nonprofit organizations"
+key_other = "All other sources"
+keys = [key_federal, key_state, key_inst, key_business, key_nonprofit, key_other]
+percent_suffix = " Percent"  # appended to a source label to name its derived percent column
+
+
+def read_excel(file):
+    # Load the sheet, taking the row at 0-based index 3 as the column header.
+    df = pd.read_excel(file, header=3)
+    
+    # Drop the last four rows, then keep only the columns whose label does not contain 'Unnamed'.
+    df = df[:-4][[field for field in df[:-4] if 'Unnamed' not in field]]
+
+    # For every funding source in `keys`, add a "<source> Percent" column: its share of `key_all`, times 100.
+    for key in keys:
+        key_percent = key + percent_suffix
+        df[key_percent] = df[key] / df[key_all] * 100  # NaN where the total is 0 (as in the sample output)
+
+    return df
+
+def print_data(data):
+    print(data)  # hands `data` straight to print(); no formatting is applied here
--- a/explore.ipynb
+++ b/explore.ipynb
@@ -30,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 1,
   "id": "eeee645e-48b5-4bdf-a386-e3354c8ac46a",
   "metadata": {},
   "outputs": [],
@@ -45,7 +45,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "e18b60a6-be26-473b-9938-c9d18c28b8aa",
   "metadata": {},
   "outputs": [],
@@ -57,7 +57,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 3,
   "id": "45b02475-f4cb-4830-8367-05ef5eefaaee",
   "metadata": {},
   "outputs": [],
@@ -73,14 +73,16 @@
  {
   "cell_type": "markdown",
   "id": "6129bee1-fadb-42cd-a146-c6c7a5217bf9",
-   "metadata": {},
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
   "source": [
    "# National Stats"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 6,
   "id": "05ec674e-60b3-491c-b056-b7518fbb451c",
   "metadata": {
    "scrolled": true
--- a/output.txt
+++ b/output.txt
@@ -0,0 +1,223 @@
+2025-12-04 15:17:57,143 - DataAnalysis             - INFO    - Loading College Scorecard data from file data/scorecard/College_Scorecard_Raw_Data_05192025/merged.parquet (run.py:33)
+2025-12-04 15:18:20,065 - DataAnalysis             - INFO    - Saving scorecard-only statistics (run.py:38)
+2025-12-04 15:18:23,197 - Scorecard Data           - INFO    - 
+
+National Percentage (Last Academic Year):  (scorecard_tools.py:135)
+2025-12-04 15:18:23,209 - Scorecard Data           - INFO    - 
+
+National Percentage (Avg):  (scorecard_tools.py:138)
+2025-12-04 15:18:23,493 - Scorecard Data           - INFO    - 
+
+Program Percentage (The University of Alabama, Last Academic Year):  (scorecard_tools.py:158)
+2025-12-04 15:18:23,504 - Scorecard Data           - INFO    - 
+
+Program Percentage (The University of Alabama, Avg):  (scorecard_tools.py:161)
+/opt/conda/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
+  warn("Workbook contains no default style, apply openpyxl's default")
+                                    Name  Percent (Last)  Percent (Avg)  Students (Last)  Students (Avg)
+35                                health       22.149643      12.027962      5960.320075             NaN
+36                    business_marketing       14.511786      17.399856      3653.717364             NaN
+15                            humanities       10.181786       9.546438      3624.366286             NaN
+8                              education        7.426786       6.705871      2231.885475             NaN
+6                               computer        7.148571       2.808726      2416.331129             NaN
+31            mechanic_repair_technology        6.220357       0.811160      1382.265304             NaN
+27              security_law_enforcement        4.961071       3.019295      1735.563754             NaN
+10                engineering_technology        3.409286       1.377941       962.291996             NaN
+20                       multidiscipline        2.957857       1.964365       981.708071             NaN
+12               family_consumer_science        2.880357       1.222658       799.266364             NaN
+32                  precision_production        2.701429       0.379921      1043.298893             NaN
+17                            biological        2.671786       4.908001       913.827739             NaN
+29                        social_science        2.312143       6.253486       567.835936             NaN
+28  public_administration_social_service        1.677500       1.601816       531.436800             NaN
+26                            psychology        1.285714       5.195744       529.515436             NaN
+4                          communication        1.136786       3.192944       456.121346             NaN
+9                            engineering        0.972143       2.803155       333.565925             NaN
+34                     visual_performing        0.807500       3.903754       287.056654             NaN
+7                      personal_culinary        0.698571       0.444786       246.205414             NaN
+11                              language        0.633929       0.951896       224.890446             NaN
+18                           mathematics        0.611071       1.048517       205.602029             NaN
+21              parks_recreation_fitness        0.593214       1.885363       225.476271             NaN
+30                          construction        0.573571       0.317400       232.164054             NaN
+13                                 legal        0.529286       0.366860       151.130189             NaN
+24                      physical_science        0.421429       1.598245       125.394396             NaN
+14                               english        0.237500       2.671756        91.474121             NaN
+37                               history        0.208929       1.670524        76.740832             NaN
+22                  philosophy_religious        0.072500       0.677424        26.515154             NaN
+3                 ethnic_cultural_gender        0.007500       0.368425         1.937561             NaN
+1                              resources        0.000000       0.700875         0.000000             NaN
+5              communications_technology        0.000000       0.215724         0.000000             NaN
+2                           architecture        0.000000       0.243856         0.000000             NaN
+0                            agriculture        0.000000       0.819862         0.000000             NaN
+16                               library        0.000000       0.012911         0.000000             NaN
+19                              military        0.000000       0.035292         0.000000             NaN
+25                    science_technology        0.000000       0.080646         0.000000             NaN
+23           theology_religious_vocation        0.000000       0.432991         0.000000             NaN
+33                        transportation        0.000000       0.331288         0.000000             NaN
+                                    Name  Percent (Last)  Percent (Avg)  Students (Last)  Students (Avg)
+36                    business_marketing       14.511786      17.399856      3653.717364             NaN
+35                                health       22.149643      12.027962      5960.320075             NaN
+15                            humanities       10.181786       9.546438      3624.366286             NaN
+8                              education        7.426786       6.705871      2231.885475             NaN
+29                        social_science        2.312143       6.253486       567.835936             NaN
+26                            psychology        1.285714       5.195744       529.515436             NaN
+17                            biological        2.671786       4.908001       913.827739             NaN
+34                     visual_performing        0.807500       3.903754       287.056654             NaN
+4                          communication        1.136786       3.192944       456.121346             NaN
+27              security_law_enforcement        4.961071       3.019295      1735.563754             NaN
+6                               computer        7.148571       2.808726      2416.331129             NaN
+9                            engineering        0.972143       2.803155       333.565925             NaN
+14                               english        0.237500       2.671756        91.474121             NaN
+20                       multidiscipline        2.957857       1.964365       981.708071             NaN
+21              parks_recreation_fitness        0.593214       1.885363       225.476271             NaN
+37                               history        0.208929       1.670524        76.740832             NaN
+28  public_administration_social_service        1.677500       1.601816       531.436800             NaN
+24                      physical_science        0.421429       1.598245       125.394396             NaN
+10                engineering_technology        3.409286       1.377941       962.291996             NaN
+12               family_consumer_science        2.880357       1.222658       799.266364             NaN
+18                           mathematics        0.611071       1.048517       205.602029             NaN
+11                              language        0.633929       0.951896       224.890446             NaN
+0                            agriculture        0.000000       0.819862         0.000000             NaN
+31            mechanic_repair_technology        6.220357       0.811160      1382.265304             NaN
+1                              resources        0.000000       0.700875         0.000000             NaN
+22                  philosophy_religious        0.072500       0.677424        26.515154             NaN
+7                      personal_culinary        0.698571       0.444786       246.205414             NaN
+23           theology_religious_vocation        0.000000       0.432991         0.000000             NaN
+32                  precision_production        2.701429       0.379921      1043.298893             NaN
+3                 ethnic_cultural_gender        0.007500       0.368425         1.937561             NaN
+13                                 legal        0.529286       0.366860       151.130189             NaN
+33                        transportation        0.000000       0.331288         0.000000             NaN
+30                          construction        0.573571       0.317400       232.164054             NaN
+2                           architecture        0.000000       0.243856         0.000000             NaN
+5              communications_technology        0.000000       0.215724         0.000000             NaN
+25                    science_technology        0.000000       0.080646         0.000000             NaN
+19                              military        0.000000       0.035292         0.000000             NaN
+16                               library        0.000000       0.012911         0.000000             NaN
+                                    Name  Percent (Last)  Percent (Avg)  Students (Last)  Students (Avg)
+36                    business_marketing           30.11      28.734643       10662.8543     7409.063086
+9                            engineering            9.66       8.218571        3420.8958     2251.159975
+35                                health            9.66       9.055000        3420.8958     2404.925375
+4                          communication            9.42      10.607143        3335.9046     2703.289471
+29                        social_science            7.99       4.308214        2829.4987     1147.649796
+12               family_consumer_science            6.96       7.195714        2464.7448     1848.827150
+26                            psychology            5.13       3.813929        1816.6869      978.664311
+17                            biological            3.89       3.384643        1377.5657      878.928314
+21              parks_recreation_fitness            2.86       0.308929        1012.8118      107.747239
+8                              education            2.72       7.219643         963.2336     1749.198443
+34                     visual_performing            2.17       3.532143         768.4621      860.670914
+20                       multidiscipline            1.72       2.466786         609.1036      591.093271
+6                               computer            1.44       0.937500         509.9472      248.493907
+37                               history            1.17       1.755714         414.3321      421.027957
+28  public_administration_social_service            1.09       1.165357         386.0017      286.223114
+24                      physical_science            1.06       0.981071         375.3778      256.140975
+14                               english            0.97       1.992857         343.5061      458.707082
+18                           mathematics            0.89       0.571786         315.1757      164.741461
+11                              language            0.45       0.713929         159.3585      173.877018
+1                              resources            0.38       0.251786         134.5694       77.018761
+22                  philosophy_religious            0.18       0.357143          63.7434       89.032021
+3                 ethnic_cultural_gender            0.09       0.299286          31.8717       73.607264
+5              communications_technology            0.00       0.000000           0.0000        0.000000
+2                           architecture            0.00       0.017500           0.0000        2.880346
+0                            agriculture            0.00       0.000000           0.0000        0.000000
+16                               library            0.00       0.000000           0.0000        0.000000
+10                engineering_technology            0.00       0.000000           0.0000        0.000000
+7                      personal_culinary            0.00       0.000000           0.0000        0.000000
+15                            humanities            0.00       0.001429           0.0000        0.228757
+13                                 legal            0.00       0.000000           0.0000        0.000000
+19                              military            0.00       0.000000           0.0000        0.000000
+27              security_law_enforcement            0.00       2.109286           0.0000      521.904729
+23           theology_religious_vocation            0.00       0.000000           0.0000        0.000000
+25                    science_technology            0.00       0.000000           0.0000        0.000000
+33                        transportation            0.00       0.000000           0.0000        0.000000
+32                  precision_production            0.00       0.000000           0.0000        0.000000
+31            mechanic_repair_technology            0.00       0.000000           0.0000        0.000000
+30                          construction            0.00       0.000000           0.0000        0.000000
+                                    Name  Percent (Last)  Percent (Avg)  Students (Last)  Students (Avg)
+36                    business_marketing           30.11      28.734643       10662.8543     7409.063086
+4                          communication            9.42      10.607143        3335.9046     2703.289471
+35                                health            9.66       9.055000        3420.8958     2404.925375
+9                            engineering            9.66       8.218571        3420.8958     2251.159975
+8                              education            2.72       7.219643         963.2336     1749.198443
+12               family_consumer_science            6.96       7.195714        2464.7448     1848.827150
+29                        social_science            7.99       4.308214        2829.4987     1147.649796
+26                            psychology            5.13       3.813929        1816.6869      978.664311
+34                     visual_performing            2.17       3.532143         768.4621      860.670914
+17                            biological            3.89       3.384643        1377.5657      878.928314
+20                       multidiscipline            1.72       2.466786         609.1036      591.093271
+27              security_law_enforcement            0.00       2.109286           0.0000      521.904729
+14                               english            0.97       1.992857         343.5061      458.707082
+37                               history            1.17       1.755714         414.3321      421.027957
+28  public_administration_social_service            1.09       1.165357         386.0017      286.223114
+24                      physical_science            1.06       0.981071         375.3778      256.140975
+6                               computer            1.44       0.937500         509.9472      248.493907
+11                              language            0.45       0.713929         159.3585      173.877018
+18                           mathematics            0.89       0.571786         315.1757      164.741461
+22                  philosophy_religious            0.18       0.357143          63.7434       89.032021
+21              parks_recreation_fitness            2.86       0.308929        1012.8118      107.747239
+3                 ethnic_cultural_gender            0.09       0.299286          31.8717       73.607264
+1                              resources            0.38       0.251786         134.5694       77.018761
+2                           architecture            0.00       0.017500           0.0000        2.880346
+15                            humanities            0.00       0.001429           0.0000        0.228757
+0                            agriculture            0.00       0.000000           0.0000        0.000000
+5              communications_technology            0.00       0.000000           0.0000        0.000000
+7                      personal_culinary            0.00       0.000000           0.0000        0.000000
+10                engineering_technology            0.00       0.000000           0.0000        0.000000
+13                                 legal            0.00       0.000000           0.0000        0.000000
+19                              military            0.00       0.000000           0.0000        0.000000
+16                               library            0.00       0.000000           0.0000        0.000000
+23           theology_religious_vocation            0.00       0.000000           0.0000        0.000000
+25                    science_technology            0.00       0.000000           0.0000        0.000000
+33                        transportation            0.00       0.000000           0.0000        0.000000
+32                  precision_production            0.00       0.000000           0.0000        0.000000
+31            mechanic_repair_technology            0.00       0.000000           0.0000        0.000000
+30                          construction            0.00       0.000000           0.0000        0.000000
+                                                Field  All R&D expenditures  ...  Nonprofit organizations Percent  All other sources Percent
+0                                      All R&D fields              184880.0  ...                         0.948183                   0.376460
+1                                             Science               92834.0  ...                         0.613999                   0.187431
+2                   Computer and information sciences               18813.0  ...                         0.015946                   0.212619
+3   Geosciences, atmospheric sciences, and ocean s...               16822.0  ...                         0.606349                   0.095114
+4                 Atmospheric science and meteorology                   0.0  ...                              NaN                        NaN
+5                       Geological and earth sciences               14248.0  ...                         0.715890                   0.112296
+6                  Ocean sciences and marine sciences                2574.0  ...                         0.000000                   0.000000
+7   Geosciences, atmospheric sciences, and ocean s...                   0.0  ...                              NaN                        NaN
+8                                       Life sciences               23853.0  ...                         1.224165                   0.314426
+9                               Agricultural sciences                  11.0  ...                         0.000000                   0.000000
+10                 Biological and biomedical sciences               10055.0  ...                         0.616609                   0.159125
+11                                    Health sciences               13736.0  ...                         1.674432                   0.422248
+12                 Natural resources and conservation                   0.0  ...                              NaN                        NaN
+13                                  Life sciences nec                  51.0  ...                         0.000000                   1.960784
+14                         Mathematics and statistics                2918.0  ...                         0.959561                   0.000000
+15                                  Physical sciences               11124.0  ...                         0.107875                   0.000000
+16                         Astronomy and astrophysics                 271.0  ...                         0.000000                   0.000000
+17                                          Chemistry                7938.0  ...                         0.151172                   0.000000
+18                                  Materials science                   0.0  ...                              NaN                        NaN
+19                                            Physics                2911.0  ...                         0.000000                   0.000000
+20                              Physical sciences nec                   4.0  ...                         0.000000                   0.000000
+21                                         Psychology                5690.0  ...                         0.351494                   0.000000
+22                                    Social sciences               13350.0  ...                         0.846442                   0.322097
+23                                       Anthropology                 792.0  ...                         1.767677                   5.429293
+24                                          Economics                3707.0  ...                         0.000000                   0.000000
+25                   Political science and government                 898.0  ...                         1.559020                   0.000000
+26      Sociology, demography, and population studies                2908.0  ...                         0.000000                   0.000000
+27                                Social sciences nec                5045.0  ...                         1.684836                   0.000000
+28                                       Sciences nec                 264.0  ...                         0.000000                   0.000000
+29                                        Engineering               51029.0  ...                         1.312979                   0.850497
+30  Aerospace, aeronautical, and astronautical eng...                4665.0  ...                         0.000000                   0.000000
+31          Bioengineering and biomedical engineering                   0.0  ...                              NaN                        NaN
+32                               Chemical engineering                6647.0  ...                         2.888521                   0.000000
+33                                  Civil engineering               15198.0  ...                         0.782998                   1.796289
+34  Electrical, electronic, and communications eng...               10299.0  ...                         3.359549                   0.854452
+35           Industrial and manufacturing engineering                   0.0  ...                              NaN                        NaN
+36                             Mechanical engineering                7056.0  ...                         0.000000                   0.255102
+37            Metallurgical and materials engineering                6473.0  ...                         0.000000                   0.849683
+38                                    Engineering nec                 691.0  ...                         1.881331                   0.000000
+39                                            Non-S&E               41017.0  ...                         1.250701                   0.214545
+40    Business management and business administration               11919.0  ...                         0.092290                   0.000000
+41      Communication and communications technologies                2675.0  ...                         0.523364                   0.074766
+42                                          Education                9731.0  ...                         1.459254                   0.000000
+43                                         Humanities                4813.0  ...                         1.059630                   1.225847
+44                                                Law                2076.0  ...                         9.633911                   0.000000
+45                                        Social work                4557.0  ...                         0.987492                   0.285275
+46                         Visual and performing arts                2803.0  ...                         0.356761                   0.499465
+47                                        Non-S&E nec                2443.0  ...                         1.637331                   0.000000
+
+[48 rows x 14 columns]
--- a/Alabama-enrollment-percent.png
+++ b/Alabama-enrollment-percent.png
--- a/Alabama-enrollment-students.png
+++ b/Alabama-enrollment-students.png
--- a/Alabama-enrollment.png
+++ b/Alabama-enrollment.png
--- a/plots/dist-students-enrolled-avg.png
+++ b/plots/dist-students-enrolled-avg.png
--- a/plots/dist-students-enrolled-last-year.png
+++ b/plots/dist-students-enrolled-last-year.png
--- a/plots/national-enrollment-percent.png
+++ b/plots/national-enrollment-percent.png
--- a/plots/national-enrollment-students.png
+++ b/plots/national-enrollment-students.png
--- a/plots/national-enrollment.png
+++ b/plots/national-enrollment.png
--- a/run.py
+++ b/run.py
@@ -0,0 +1,50 @@
+import os
+import logging
+
+import awkward as ak
+
+import utils
+import scorecard_tools
+import excel_tools
+
+
+if __name__ == "__main__":
+    # Entry point: load the College Scorecard parquet file, save the plots,
+    # print the enrollment tables, then read and print data/ua.xlsx.
+
+    # Setup Args
+    parser = utils.get_common_args(prog="Main Data Parser")
+    parser.add_argument(
+        "-o",
+        "--output-dir",
+        default="plots",
+        help="Directory to save generated plots",
+    )
+    parser.add_argument(
+        "-s",
+        "--school",
+        default="The University of Alabama",
+        help="School name used for the per-school plots and tables",
+    )
+
+    args = parser.parse_args()
+
+    # Setup Logging
+    utils.setup_logging(args.debug)
+    logger = logging.getLogger("DataAnalysis")
+
+    # Load Scorecard
+    # os.listdir() returns entries in arbitrary order and may include stray
+    # files, so sort the entries and take the first one that actually holds
+    # merged.parquet instead of blindly indexing [0].
+    scorecard_root = os.path.join(args.data_dir, "scorecard")
+    candidates = [
+        os.path.join(scorecard_root, entry, "merged.parquet")
+        for entry in sorted(os.listdir(scorecard_root))
+    ]
+    scorecard_file = next(
+        (path for path in candidates if os.path.isfile(path)), None
+    )
+    if scorecard_file is None:
+        raise FileNotFoundError(
+            f"No merged.parquet found in any subdirectory of {scorecard_root}"
+        )
+
+    logger.info(f"Loading College Scorecard data from file {scorecard_file}")
+    scorecard_data = ak.from_parquet(scorecard_file)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    logger.info("Saving scorecard-only statistics")
+    scorecard_data = scorecard_tools.mask_valid(scorecard_data)
+    scorecard_tools.save_scorecard_stats(args.output_dir, scorecard_data)
+    scorecard_tools.save_scorecard_stats_national(args.output_dir, scorecard_data)
+    scorecard_tools.save_scorecard_stats_school(args.output_dir, scorecard_data, args.school)
+
+    # Print
+    scorecard_tools.print_enrollment_national(scorecard_data)
+    scorecard_tools.print_enrollment(scorecard_data, args.school)
+
+    # Load Excel
+    excel_data = excel_tools.read_excel(os.path.join(args.data_dir, "ua.xlsx"))
+    excel_tools.print_data(excel_data)
--- a/scorecard_tools.py
+++ b/scorecard_tools.py
@@ -0,0 +1,162 @@
+import os
+import logging
+
+import awkward as ak
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+# Module-level logger; records from this module carry the name "Scorecard Data".
+logger = logging.getLogger("Scorecard Data")
+
+
+def mask_valid(data):
+    """Return only the records with usable ``physical_science`` percentages.
+
+    A record is dropped when its
+    ``academics.program_percentage.physical_science`` values contain a NaN
+    anywhere along axis 1, or when every value along axis 1 equals zero.
+    Only this one field is inspected; presumably it stands in for the whole
+    program_percentage record (TODO confirm).
+
+    Args:
+        data: Scorecard array exposing
+            ``academics.program_percentage.physical_science``.
+
+    Returns:
+        ``data`` indexed by the boolean keep-mask.
+    """
+    physical_science = data.academics.program_percentage.physical_science
+    has_nan = ak.any(np.isnan(physical_science), axis=1)
+    is_all_zero = ak.all(physical_science == 0, axis=1)
+    # Keep a record only when neither placeholder pattern is present.
+    return data[~has_nan & ~is_all_zero]
+
+def mask_school(data, name):
+    """Return the first record whose last ``school.name`` entry equals ``name``.
+
+    The comparison uses the last entry along axis 1 of ``data.school.name``
+    (presumably the most recent year -- TODO confirm).
+
+    Args:
+        data: Scorecard array exposing ``school.name``.
+        name: School name to look up (exact match).
+
+    Returns:
+        The first matching record of ``data``.
+
+    Raises:
+        IndexError: If no record matches ``name``.
+    """
+    matches = data[data.school.name[:, -1] == name]
+    if len(matches) == 0:
+        # Indexing [0] on an empty selection would fail without saying which
+        # school was missing, so report the name explicitly.
+        raise IndexError(f"No school named {name!r} in the scorecard data")
+    return matches[0]
+
+
+# Percentage enrollment per field over time averaged across the nation
+def save_scorecard_stats_national(output_dir, data, ext="png"):
+    """Plot national per-program enrollment and save both figures.
+
+    Two files are written into ``output_dir``:
+
+    * ``national-enrollment-percent.<ext>``: mean program percentage
+      (times 100) for each program field.
+    * ``national-enrollment-students.<ext>``: mean of program percentage
+      multiplied by ``student.enrollment.undergrad_12_month``.
+
+    Means are taken over axis 0 (across records, presumably leaving one value
+    per year -- TODO confirm). Fields whose mean percentage is below 0.05 at
+    every position are left out of both figures.
+
+    Args:
+        output_dir: Directory the figures are saved into; it is not created
+            here.
+        data: Scorecard array exposing ``academics.program_percentage`` and
+            ``student.enrollment.undergrad_12_month``.
+        ext: File extension appended to the saved file names.
+    """
+    prog_percentage = data.academics.program_percentage
+    students = data.student.enrollment.undergrad_12_month
+
+    # Compute each field's mean percentage once and remember which fields
+    # pass the 0.05 threshold, so both figures show the same programs.
+    mean_percent = {}
+    for key in prog_percentage.fields:
+        mean = ak.mean(prog_percentage[key], axis=0)
+        if not ak.all(mean < 0.05):
+            mean_percent[key] = mean
+
+    # Percent enrollment
+    for key, mean in mean_percent.items():
+        plt.plot(mean * 100, label=key)
+    plt.xlabel("Year")
+    plt.ylabel("Percent Enrollment")
+    plt.title("Average Enrollment (National)")
+    plt.legend()
+    plt.savefig(os.path.join(output_dir, f"national-enrollment-percent.{ext}"))
+    plt.clf()
+
+    # Students enrolled
+    for key in mean_percent:
+        plt.plot(ak.mean(prog_percentage[key] * students, axis=0), label=key)
+    plt.xlabel("Year")
+    # This figure shows head counts, so it must not reuse the percent label.
+    plt.ylabel("Students Enrolled")
+    plt.title("Average Enrollment (National)")
+    plt.legend()
+    plt.savefig(os.path.join(output_dir, f"national-enrollment-students.{ext}"))
+    plt.clf()
+
+    
+# Percentage enrollment per field over time for one school
+def save_scorecard_stats_school(output_dir, data, school: str, ext="png"):
+    """Plot per-program enrollment for a single school and save both figures.
+
+    The school's record is selected with ``mask_school``. Two files are then
+    written into ``output_dir``:
+
+    * ``<school>-enrollment-percent.<ext>``: each program percentage times
+      100.
+    * ``<school>-enrollment-students.<ext>``: each program percentage
+      multiplied by ``student.enrollment.undergrad_12_month``.
+
+    Fields whose mean percentage for this school is below 0.05 are left out
+    of both figures.
+
+    Args:
+        output_dir: Directory the figures are saved into; it is not created
+            here.
+        data: Scorecard array exposing ``school.name``,
+            ``academics.program_percentage`` and
+            ``student.enrollment.undergrad_12_month``.
+        school: Name of the school to plot; also used verbatim in the plot
+            title and the output file names.
+        ext: File extension appended to the saved file names.
+    """
+    data = mask_school(data, school)
+    prog_percentage = data.academics.program_percentage
+    students = data.student.enrollment.undergrad_12_month
+
+    # Decide once which fields to show: those whose mean over this school's
+    # entries is not below 0.05.
+    shown = [
+        key
+        for key in prog_percentage.fields
+        if not (ak.mean(prog_percentage[key], axis=0) < 0.05)
+    ]
+
+    # Plot percent enrollment
+    for key in shown:
+        plt.plot(prog_percentage[key] * 100, label=key)
+    plt.xlabel("Year")
+    plt.ylabel("Percent Enrollment")
+    plt.title(f"Average Enrollment ({school})")
+    plt.legend()
+    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-percent.{ext}"))
+    plt.clf()
+
+    # Plot student enrollment
+    for key in shown:
+        plt.plot(prog_percentage[key] * students, label=key)
+    plt.xlabel("Year")
+    # This figure shows head counts, so it must not reuse the percent label.
+    plt.ylabel("Students Enrolled")
+    plt.title(f"Average Enrollment ({school})")
+    plt.legend()
+    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-students.{ext}"))
+    plt.clf()
+
+def save_scorecard_stats(output_dir, data, ext="png"):
+    """Save histograms of undergraduate enrollment per record.
+
+    Two histograms of ``student.enrollment.undergrad_12_month`` with a
+    log-scaled y axis are written into ``output_dir``. Both use bin edges
+    every 1,000 students from 0 to 99,000:
+
+    * ``dist-students-enrolled-avg.<ext>``: the mean along axis 1 of each
+      record.
+    * ``dist-students-enrolled-last-year.<ext>``: the last entry along
+      axis 1 of each record.
+
+    Args:
+        output_dir: Directory the figures are saved into; it is not created
+            here.
+        data: Scorecard array exposing
+            ``student.enrollment.undergrad_12_month``.
+        ext: File extension appended to the saved file names.
+    """
+    enrollment = data.student.enrollment.undergrad_12_month
+    bins = np.arange(0, 100_000, 1_000)
+
+    # (values, title, output file stem) for each histogram, in save order.
+    histograms = (
+        (
+            ak.mean(enrollment, axis=1),
+            "Enrollment Distribution (Average)",
+            "dist-students-enrolled-avg",
+        ),
+        (
+            enrollment[:, -1],
+            "Enrollment Distribution (Last Year)",
+            "dist-students-enrolled-last-year",
+        ),
+    )
+    for values, title, stem in histograms:
+        plt.hist(values, bins=bins)
+        plt.xlabel("Students Enrolled")
+        plt.ylabel("University Count")
+        plt.title(title)
+        plt.yscale("log")
+        plt.savefig(os.path.join(output_dir, f"{stem}.{ext}"))
+        plt.clf()
+
+
+
+# Print national enrollment stats
+def print_enrollment_national(data):
+    """Print national per-program enrollment tables.
+
+    Builds one row per program field holding the mean percentage (times 100)
+    and the mean student count (percentage times
+    ``student.enrollment.undergrad_12_month``), each computed for the last
+    entry along axis 1 and over all entries. The table is printed twice,
+    sorted in descending order by "Percent (Last)" and by "Percent (Avg)",
+    with a log message ahead of each.
+
+    Args:
+        data: Scorecard array exposing ``academics.program_percentage`` and
+            ``student.enrollment.undergrad_12_month``.
+    """
+    prog_percentage = data.academics.program_percentage
+    students = data.student.enrollment.undergrad_12_month
+
+    # Last entry of every record (presumably the latest academic year). A
+    # bare [-1] would select the last record instead, which is why the
+    # "Last Year" histogram in save_scorecard_stats also slices [:, -1].
+    students_last = students[:, -1]
+
+    # Create dataframe of enrollment across the nation (latest academic year and average)
+    df = {'Name': list(prog_percentage.fields), 'Percent (Last)': [], 'Percent (Avg)': [], "Students (Last)": [], "Students (Avg)": [] }
+    for key in df['Name']:
+        vals = prog_percentage[key]
+        vals_last = vals[:, -1]
+        df['Percent (Last)'].append(ak.mean(vals_last) * 100)
+        df['Percent (Avg)'].append(ak.mean(vals) * 100)
+        df['Students (Last)'].append(ak.mean(vals_last * students_last))
+        df['Students (Avg)'].append(ak.mean(vals * students))
+
+    df = pd.DataFrame(df)
+
+    # Print Dataframe
+    logger.info("\n\nNational Percentage (Last Academic Year): ")
+    print(df.sort_values("Percent (Last)", ascending=False))
+
+    logger.info("\n\nNational Percentage (Avg): ")
+    print(df.sort_values("Percent (Avg)", ascending=False))
+
+def print_enrollment(data, name: str):
+    """Print per-program enrollment tables for a single school.
+
+    The school's record is selected with ``mask_school``. One row is built
+    per program field with the percentage (times 100) and the student count
+    (percentage times ``student.enrollment.undergrad_12_month``), each for
+    the last entry and as a mean over all entries. The table is printed
+    twice, sorted in descending order by "Percent (Last)" and by
+    "Percent (Avg)", with a log message ahead of each.
+
+    Args:
+        data: Scorecard array exposing ``school.name``,
+            ``academics.program_percentage`` and
+            ``student.enrollment.undergrad_12_month``.
+        name: Name of the school to report on.
+    """
+    school = mask_school(data, name)
+    percentages = school.academics.program_percentage
+    head_count = school.student.enrollment.undergrad_12_month
+
+    # One column per statistic, one row per program field of this school.
+    fields = list(percentages.fields)
+    table = pd.DataFrame({
+        'Name': fields,
+        'Percent (Last)': [percentages[f][-1] * 100 for f in fields],
+        'Percent (Avg)': [ak.mean(percentages[f]) * 100 for f in fields],
+        "Students (Last)": [percentages[f][-1] * head_count[-1] for f in fields],
+        "Students (Avg)": [ak.mean(percentages[f] * head_count) for f in fields],
+    })
+
+    # Print the table once per sort key, most popular program first.
+    logger.info(f"\n\nProgram Percentage ({name}, Last Academic Year): ")
+    print(table.sort_values("Percent (Last)", ascending=False))
+
+    logger.info(f"\n\nProgram Percentage ({name}, Avg): ")
+    print(table.sort_values("Percent (Avg)", ascending=False))
--- a/utils.py
+++ b/utils.py
@@ -0,0 +1,61 @@
+import argparse
+import logging
+import os
+
+
+# Module-level logger named "utils".
+logger = logging.getLogger("utils")
+
+# Common Args
+def get_common_args(prog: str):
+    """Create an argument parser pre-loaded with the shared options.
+
+    The returned parser is not parsed here, so callers can add their own
+    options before calling ``parse_args()``.
+
+    Options:
+        -D / --data-dir: directory for downloaded data; defaults to the
+            ``DATA_DIRECTORY`` environment variable, or ``"data"`` when that
+            variable is unset.
+        -d / --debug: flag that enables verbose logging; off by default.
+
+    Args:
+        prog: Program name shown in usage and help output.
+
+    Returns:
+        The configured ``argparse.ArgumentParser``.
+    """
+    # The environment is read when the parser is built, not at import time.
+    default_data_dir = os.environ.get("DATA_DIRECTORY", "data")
+
+    common = argparse.ArgumentParser(prog=prog)
+    common.add_argument(
+        "-D", "--data-dir",
+        default=default_data_dir,
+        help="Directory for downloaded data",
+    )
+    common.add_argument(
+        "-d", "--debug",
+        action="store_true",
+        default=False,
+        help="Enable verbose logging",
+    )
+    return common
+
+
+# LOGGING
+class Formatter(logging.Formatter):
+    """Log formatter that colours each record according to its level.
+
+    DEBUG is grey, WARNING yellow, ERROR red and CRITICAL bold red; INFO is
+    left uncoloured. Records with any other level number are formatted with
+    the uncoloured INFO layout.
+    """
+
+    # ANSI escape sequences used to colour the output.
+    grey = "\x1b[37m"
+    yellow = "\x1b[33m"
+    red = "\x1b[31m"
+    bold_red = "\x1b[1;31m"
+    reset = "\x1b[0m"
+    # Shared message layout. Named FORMAT so it is not shadowed by (or
+    # confused with) the format() method defined below.
+    FORMAT = "%(asctime)s - %(name)-24s - %(levelname)-7s - %(message)s (%(filename)s:%(lineno)d)"
+
+    # One pre-built formatter per standard level.
+    FORMATS = {
+        logging.DEBUG: logging.Formatter(grey + FORMAT + reset),
+        logging.INFO: logging.Formatter(FORMAT),
+        logging.WARNING: logging.Formatter(yellow + FORMAT + reset),
+        logging.ERROR: logging.Formatter(red + FORMAT + reset),
+        logging.CRITICAL: logging.Formatter(bold_red + FORMAT + reset),
+    }
+
+    def format(self, record):
+        """Format ``record`` with the formatter registered for its level.
+
+        Level numbers without an entry in ``FORMATS`` (custom levels) fall
+        back to the INFO formatter instead of failing on a missing entry.
+        """
+        formatter = self.FORMATS.get(record.levelno, self.FORMATS[logging.INFO])
+        return formatter.format(record)
+
+
+def setup_logging(debug: bool = False):
+    """Configure root logging with a stream handler using ``Formatter``.
+
+    Args:
+        debug: When true the root level is DEBUG, otherwise INFO.
+
+    Note:
+        Configuration goes through ``logging.basicConfig``, which leaves the
+        root logger untouched if it already has handlers.
+    """
+    level = logging.DEBUG if debug else logging.INFO
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(Formatter())
+
+    logging.basicConfig(level=level, handlers=[handler])