feat: final push for submission

This commit is contained in:
2025-12-06 03:59:17 +00:00
parent a6e6a51041
commit a2b6b6fedd
17 changed files with 644 additions and 48 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
data/
.ipynb_checkpoints/
__pycache__/

48
README.md Normal file
View File

@@ -0,0 +1,48 @@
# Educational Statistics Analysis Tool
Python utility to parse data from the Department of Education and National Science Foundation.
## Description
This project was created to answer the question "If I have $5, should I give it to the art department or to the history department?"
This project does so by gathering historical enrollment data from the Department of Education's College Scorecard, indexable by year, institution, and subject, and comparing it against the National Science Foundation's funding data, which is indexable by year, institution, subject, and funding source.
## Getting Started
### Dependencies
* This project was built against Python 3.12
* Some scripts require a large amount of memory. An HPC cluster or other server environment is recommended.
### Executing program
* Due to concerns with license restrictions on data, I have opted not to redistribute any raw or modified data files with this project. Instead, data must be downloaded from the DoE and NSF directly. This was automated with the `download.sh` file.
* The `.csv` files downloaded from the College Scorecard must be converted into an Awkward Array for effective data analysis. This is done with `convert.py`. **This step requires 80GB of RAM**, and as such, must be done on a machine with sufficient memory.
* Final plotting can be done with `run.py`. Sample outputs are provided for user convenience in `output.txt` (for tabular outputs) and in the `plots/` directory.
## Help Wanted!
A traditional repository would have the Issues tab in their git host acting as their feature request / help wanted section. However, I've not tested the Issues tab (as I'm also the only one who uses this git server instance), so a simple list will suffice:
- Implement cross-comparison of subject fields with the best available subject equivalent in the funding column
- Implement mass-scraping of the NSF site to gather funding data for all institutions across all years, rather than just 2025 data for UA
- Modify `convert.py` to accept the CSVs as Dask DataFrames rather than as Pandas tables, to allow for distributed and/or delayed computing of the conversion. This would allow for people with ordinary laptops to run the program
## Authors
* Nathan Nguyen
## Version History
* 0.1
* Initial Release
## License
I'm not a lawyer - most of this data probably has restrictions, anyways. If I was, though, I'd use the [NAME HERE](https://www.youtube.com/watch?v=XfELJU1mRMg) License - see the LICENSE.md file for details
## Acknowledgments
Inspiration, code snippets, etc.
* [ReadMe Template](https://gist.githubusercontent.com/DomPizzie/7a5ff55ffa9081f2de27c315f5018afc/raw/d59043abbb123089ad6602aba571121b71d91d7f/README-Template.md)

View File

@@ -1,29 +1,47 @@
import os
import yaml
import logging
import awkward as ak
import pandas as pd
import numpy as np
import tqdm
scorecard_dir = "data/scorecard"
scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
import utils
print("Loading data.yaml")
if __name__ == "__main__":
# Setup Args
parser = utils.get_common_args(prog="CSV -> Pandas Scorecard Converter")
args = parser.parse_args()
# Setup Logging
utils.setup_logging(args.debug)
logger = logging.getLogger("CSVPandasConverter")
scorecard_dir = os.path.join(args.data_dir, "scorecard")
scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
logger.info(f"Loading College Scorecard data from directory {scorecard_dir}")
logger.debug("Loading metadata from data.yaml")
with open(os.path.join(scorecard_dir, 'data.yaml'), 'r') as file:
data = yaml.safe_load(file)
print("Loading CSVs to dataframes")
logger.info("Loading all CSV files as Pandas dataframes")
files = [f'MERGED{i}_{(i + 1) % 100:02}_PP.csv' for i in tqdm.trange(1996, 2024)]
dataframes = [pd.read_csv(os.path.join(scorecard_dir, file)) for file in tqdm.tqdm(files)]
print("Appending extra rows where needed")
logger.info("Creating list of all UNITIDs across all files")
unit_ids = np.unique(np.hstack([frame.UNITID.to_numpy() for frame in tqdm.tqdm(dataframes)]))
logger.info("Appending extra columns to each year's dataframes to prepare for appending")
for i, frame in tqdm.tqdm(enumerate(dataframes)):
new_rows = pd.DataFrame({"UNITID": unit_ids[~np.isin(unit_ids, frame.UNITID)]})
dataframes[i] = pd.concat([frame, new_rows]).sort_values(by=["UNITID", "OPEID"])
print("Converting to Results Array")
logger.info("Converting to Results Array")
result = {}
for key, sec in tqdm.tqdm(data['dictionary'].items()):
if 'calculate' in sec:
@@ -49,12 +67,12 @@ for key, sec in tqdm.tqdm(data['dictionary'].items()):
obj = obj.astype(str)
section[parts[-1]] = obj
print("Cleanup: Deleting Dataframes from Memory")
logger.info("Cleanup: Deleting Dataframes from Memory")
del dataframes # Memory cleanup
print("Converting to Awkward Array")
logger.info("Converting to Awkward Array")
a = ak.Array(result)
del result # Memory cleanup
print("Writing to Disk")
logger.info("Writing to Disk")
ak.to_parquet(a, os.path.join(scorecard_dir, "merged.parquet"))

31
excel_tools.py Normal file
View File

@@ -0,0 +1,31 @@
import pandas as pd
# Keys
# Column labels looked up in the spreadsheet loaded by read_excel().
# key_all is the column used as the denominator for the percentage columns.
key_all = "All R&D expenditures"
key_federal = "Federal government"
key_state = "State and local government"
key_inst = "Institution funds"
key_business = "Business"
key_nonprofit = "Nonprofit organizations"
key_other = "All other sources"
# Columns that receive a matching "<name> Percent" column in read_excel().
keys = [key_federal, key_state, key_inst, key_business, key_nonprofit, key_other]
# Suffix appended to a column label to name its derived percentage column.
percent_suffix = " Percent"
def read_excel(file):
    """Load a funding spreadsheet and add a percentage column per funding source.

    The sheet is read with its header on the fourth row (``header=3``). The
    last four rows and every column whose label contains ``"Unnamed"`` are
    dropped (presumably footnotes and empty spacer columns -- confirm against
    the actual workbook). For each label in ``keys`` a new column named
    ``label + percent_suffix`` is added, holding that column as a percentage
    of the ``key_all`` column.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.

    Returns:
        A new ``pandas.DataFrame`` with the cleaned rows/columns plus the
        derived percentage columns. Rows whose ``key_all`` value is zero get
        NaN/inf percentages, as produced by pandas division.
    """
    # Read excel
    raw = pd.read_excel(file, header=3)

    # Remove fluff: trailing rows first, then placeholder columns.
    body = raw.iloc[:-4]
    # str() guards against non-string header cells (e.g. numeric labels),
    # which would otherwise make the substring test raise TypeError.
    wanted = [col for col in body.columns if 'Unnamed' not in str(col)]
    # Work on an explicit copy so the column assignments below never write
    # into a view of the raw frame (avoids SettingWithCopy issues).
    df = body[wanted].copy()

    # Add percent
    for key in keys:
        df[key + percent_suffix] = df[key] / df[key_all] * 100

    return df
def print_data(data):
    """Write *data* to standard output using its default string form."""
    print(data)

View File

@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 1,
"id": "eeee645e-48b5-4bdf-a386-e3354c8ac46a",
"metadata": {},
"outputs": [],
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "e18b60a6-be26-473b-9938-c9d18c28b8aa",
"metadata": {},
"outputs": [],
@@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 3,
"id": "45b02475-f4cb-4830-8367-05ef5eefaaee",
"metadata": {},
"outputs": [],
@@ -73,14 +73,16 @@
{
"cell_type": "markdown",
"id": "6129bee1-fadb-42cd-a146-c6c7a5217bf9",
"metadata": {},
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# National Stats"
]
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 6,
"id": "05ec674e-60b3-491c-b056-b7518fbb451c",
"metadata": {
"scrolled": true

223
output.txt Normal file
View File

@@ -0,0 +1,223 @@
2025-12-04 15:17:57,143 - DataAnalysis - INFO - Loading College Scorecard data from file data/scorecard/College_Scorecard_Raw_Data_05192025/merged.parquet (run.py:33)
2025-12-04 15:18:20,065 - DataAnalysis - INFO - Saving scorecard-only statistics (run.py:38)
2025-12-04 15:18:23,197 - Scorecard Data - INFO -
National Percentage (Last Academic Year): (scorecard_tools.py:135)
2025-12-04 15:18:23,209 - Scorecard Data - INFO -
National Percentage (Avg): (scorecard_tools.py:138)
2025-12-04 15:18:23,493 - Scorecard Data - INFO -
Program Percentage (The University of Alabama, Last Academic Year): (scorecard_tools.py:158)
2025-12-04 15:18:23,504 - Scorecard Data - INFO -
Program Percentage (The University of Alabama, Avg): (scorecard_tools.py:161)
/opt/conda/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
35 health 22.149643 12.027962 5960.320075 NaN
36 business_marketing 14.511786 17.399856 3653.717364 NaN
15 humanities 10.181786 9.546438 3624.366286 NaN
8 education 7.426786 6.705871 2231.885475 NaN
6 computer 7.148571 2.808726 2416.331129 NaN
31 mechanic_repair_technology 6.220357 0.811160 1382.265304 NaN
27 security_law_enforcement 4.961071 3.019295 1735.563754 NaN
10 engineering_technology 3.409286 1.377941 962.291996 NaN
20 multidiscipline 2.957857 1.964365 981.708071 NaN
12 family_consumer_science 2.880357 1.222658 799.266364 NaN
32 precision_production 2.701429 0.379921 1043.298893 NaN
17 biological 2.671786 4.908001 913.827739 NaN
29 social_science 2.312143 6.253486 567.835936 NaN
28 public_administration_social_service 1.677500 1.601816 531.436800 NaN
26 psychology 1.285714 5.195744 529.515436 NaN
4 communication 1.136786 3.192944 456.121346 NaN
9 engineering 0.972143 2.803155 333.565925 NaN
34 visual_performing 0.807500 3.903754 287.056654 NaN
7 personal_culinary 0.698571 0.444786 246.205414 NaN
11 language 0.633929 0.951896 224.890446 NaN
18 mathematics 0.611071 1.048517 205.602029 NaN
21 parks_recreation_fitness 0.593214 1.885363 225.476271 NaN
30 construction 0.573571 0.317400 232.164054 NaN
13 legal 0.529286 0.366860 151.130189 NaN
24 physical_science 0.421429 1.598245 125.394396 NaN
14 english 0.237500 2.671756 91.474121 NaN
37 history 0.208929 1.670524 76.740832 NaN
22 philosophy_religious 0.072500 0.677424 26.515154 NaN
3 ethnic_cultural_gender 0.007500 0.368425 1.937561 NaN
1 resources 0.000000 0.700875 0.000000 NaN
5 communications_technology 0.000000 0.215724 0.000000 NaN
2 architecture 0.000000 0.243856 0.000000 NaN
0 agriculture 0.000000 0.819862 0.000000 NaN
16 library 0.000000 0.012911 0.000000 NaN
19 military 0.000000 0.035292 0.000000 NaN
25 science_technology 0.000000 0.080646 0.000000 NaN
23 theology_religious_vocation 0.000000 0.432991 0.000000 NaN
33 transportation 0.000000 0.331288 0.000000 NaN
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
36 business_marketing 14.511786 17.399856 3653.717364 NaN
35 health 22.149643 12.027962 5960.320075 NaN
15 humanities 10.181786 9.546438 3624.366286 NaN
8 education 7.426786 6.705871 2231.885475 NaN
29 social_science 2.312143 6.253486 567.835936 NaN
26 psychology 1.285714 5.195744 529.515436 NaN
17 biological 2.671786 4.908001 913.827739 NaN
34 visual_performing 0.807500 3.903754 287.056654 NaN
4 communication 1.136786 3.192944 456.121346 NaN
27 security_law_enforcement 4.961071 3.019295 1735.563754 NaN
6 computer 7.148571 2.808726 2416.331129 NaN
9 engineering 0.972143 2.803155 333.565925 NaN
14 english 0.237500 2.671756 91.474121 NaN
20 multidiscipline 2.957857 1.964365 981.708071 NaN
21 parks_recreation_fitness 0.593214 1.885363 225.476271 NaN
37 history 0.208929 1.670524 76.740832 NaN
28 public_administration_social_service 1.677500 1.601816 531.436800 NaN
24 physical_science 0.421429 1.598245 125.394396 NaN
10 engineering_technology 3.409286 1.377941 962.291996 NaN
12 family_consumer_science 2.880357 1.222658 799.266364 NaN
18 mathematics 0.611071 1.048517 205.602029 NaN
11 language 0.633929 0.951896 224.890446 NaN
0 agriculture 0.000000 0.819862 0.000000 NaN
31 mechanic_repair_technology 6.220357 0.811160 1382.265304 NaN
1 resources 0.000000 0.700875 0.000000 NaN
22 philosophy_religious 0.072500 0.677424 26.515154 NaN
7 personal_culinary 0.698571 0.444786 246.205414 NaN
23 theology_religious_vocation 0.000000 0.432991 0.000000 NaN
32 precision_production 2.701429 0.379921 1043.298893 NaN
3 ethnic_cultural_gender 0.007500 0.368425 1.937561 NaN
13 legal 0.529286 0.366860 151.130189 NaN
33 transportation 0.000000 0.331288 0.000000 NaN
30 construction 0.573571 0.317400 232.164054 NaN
2 architecture 0.000000 0.243856 0.000000 NaN
5 communications_technology 0.000000 0.215724 0.000000 NaN
25 science_technology 0.000000 0.080646 0.000000 NaN
19 military 0.000000 0.035292 0.000000 NaN
16 library 0.000000 0.012911 0.000000 NaN
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
36 business_marketing 30.11 28.734643 10662.8543 7409.063086
9 engineering 9.66 8.218571 3420.8958 2251.159975
35 health 9.66 9.055000 3420.8958 2404.925375
4 communication 9.42 10.607143 3335.9046 2703.289471
29 social_science 7.99 4.308214 2829.4987 1147.649796
12 family_consumer_science 6.96 7.195714 2464.7448 1848.827150
26 psychology 5.13 3.813929 1816.6869 978.664311
17 biological 3.89 3.384643 1377.5657 878.928314
21 parks_recreation_fitness 2.86 0.308929 1012.8118 107.747239
8 education 2.72 7.219643 963.2336 1749.198443
34 visual_performing 2.17 3.532143 768.4621 860.670914
20 multidiscipline 1.72 2.466786 609.1036 591.093271
6 computer 1.44 0.937500 509.9472 248.493907
37 history 1.17 1.755714 414.3321 421.027957
28 public_administration_social_service 1.09 1.165357 386.0017 286.223114
24 physical_science 1.06 0.981071 375.3778 256.140975
14 english 0.97 1.992857 343.5061 458.707082
18 mathematics 0.89 0.571786 315.1757 164.741461
11 language 0.45 0.713929 159.3585 173.877018
1 resources 0.38 0.251786 134.5694 77.018761
22 philosophy_religious 0.18 0.357143 63.7434 89.032021
3 ethnic_cultural_gender 0.09 0.299286 31.8717 73.607264
5 communications_technology 0.00 0.000000 0.0000 0.000000
2 architecture 0.00 0.017500 0.0000 2.880346
0 agriculture 0.00 0.000000 0.0000 0.000000
16 library 0.00 0.000000 0.0000 0.000000
10 engineering_technology 0.00 0.000000 0.0000 0.000000
7 personal_culinary 0.00 0.000000 0.0000 0.000000
15 humanities 0.00 0.001429 0.0000 0.228757
13 legal 0.00 0.000000 0.0000 0.000000
19 military 0.00 0.000000 0.0000 0.000000
27 security_law_enforcement 0.00 2.109286 0.0000 521.904729
23 theology_religious_vocation 0.00 0.000000 0.0000 0.000000
25 science_technology 0.00 0.000000 0.0000 0.000000
33 transportation 0.00 0.000000 0.0000 0.000000
32 precision_production 0.00 0.000000 0.0000 0.000000
31 mechanic_repair_technology 0.00 0.000000 0.0000 0.000000
30 construction 0.00 0.000000 0.0000 0.000000
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
36 business_marketing 30.11 28.734643 10662.8543 7409.063086
4 communication 9.42 10.607143 3335.9046 2703.289471
35 health 9.66 9.055000 3420.8958 2404.925375
9 engineering 9.66 8.218571 3420.8958 2251.159975
8 education 2.72 7.219643 963.2336 1749.198443
12 family_consumer_science 6.96 7.195714 2464.7448 1848.827150
29 social_science 7.99 4.308214 2829.4987 1147.649796
26 psychology 5.13 3.813929 1816.6869 978.664311
34 visual_performing 2.17 3.532143 768.4621 860.670914
17 biological 3.89 3.384643 1377.5657 878.928314
20 multidiscipline 1.72 2.466786 609.1036 591.093271
27 security_law_enforcement 0.00 2.109286 0.0000 521.904729
14 english 0.97 1.992857 343.5061 458.707082
37 history 1.17 1.755714 414.3321 421.027957
28 public_administration_social_service 1.09 1.165357 386.0017 286.223114
24 physical_science 1.06 0.981071 375.3778 256.140975
6 computer 1.44 0.937500 509.9472 248.493907
11 language 0.45 0.713929 159.3585 173.877018
18 mathematics 0.89 0.571786 315.1757 164.741461
22 philosophy_religious 0.18 0.357143 63.7434 89.032021
21 parks_recreation_fitness 2.86 0.308929 1012.8118 107.747239
3 ethnic_cultural_gender 0.09 0.299286 31.8717 73.607264
1 resources 0.38 0.251786 134.5694 77.018761
2 architecture 0.00 0.017500 0.0000 2.880346
15 humanities 0.00 0.001429 0.0000 0.228757
0 agriculture 0.00 0.000000 0.0000 0.000000
5 communications_technology 0.00 0.000000 0.0000 0.000000
7 personal_culinary 0.00 0.000000 0.0000 0.000000
10 engineering_technology 0.00 0.000000 0.0000 0.000000
13 legal 0.00 0.000000 0.0000 0.000000
19 military 0.00 0.000000 0.0000 0.000000
16 library 0.00 0.000000 0.0000 0.000000
23 theology_religious_vocation 0.00 0.000000 0.0000 0.000000
25 science_technology 0.00 0.000000 0.0000 0.000000
33 transportation 0.00 0.000000 0.0000 0.000000
32 precision_production 0.00 0.000000 0.0000 0.000000
31 mechanic_repair_technology 0.00 0.000000 0.0000 0.000000
30 construction 0.00 0.000000 0.0000 0.000000
Field All R&D expenditures ... Nonprofit organizations Percent All other sources Percent
0 All R&D fields 184880.0 ... 0.948183 0.376460
1 Science 92834.0 ... 0.613999 0.187431
2 Computer and information sciences 18813.0 ... 0.015946 0.212619
3 Geosciences, atmospheric sciences, and ocean s... 16822.0 ... 0.606349 0.095114
4 Atmospheric science and meteorology 0.0 ... NaN NaN
5 Geological and earth sciences 14248.0 ... 0.715890 0.112296
6 Ocean sciences and marine sciences 2574.0 ... 0.000000 0.000000
7 Geosciences, atmospheric sciences, and ocean s... 0.0 ... NaN NaN
8 Life sciences 23853.0 ... 1.224165 0.314426
9 Agricultural sciences 11.0 ... 0.000000 0.000000
10 Biological and biomedical sciences 10055.0 ... 0.616609 0.159125
11 Health sciences 13736.0 ... 1.674432 0.422248
12 Natural resources and conservation 0.0 ... NaN NaN
13 Life sciences nec 51.0 ... 0.000000 1.960784
14 Mathematics and statistics 2918.0 ... 0.959561 0.000000
15 Physical sciences 11124.0 ... 0.107875 0.000000
16 Astronomy and astrophysics 271.0 ... 0.000000 0.000000
17 Chemistry 7938.0 ... 0.151172 0.000000
18 Materials science 0.0 ... NaN NaN
19 Physics 2911.0 ... 0.000000 0.000000
20 Physical sciences nec 4.0 ... 0.000000 0.000000
21 Psychology 5690.0 ... 0.351494 0.000000
22 Social sciences 13350.0 ... 0.846442 0.322097
23 Anthropology 792.0 ... 1.767677 5.429293
24 Economics 3707.0 ... 0.000000 0.000000
25 Political science and government 898.0 ... 1.559020 0.000000
26 Sociology, demography, and population studies 2908.0 ... 0.000000 0.000000
27 Social sciences nec 5045.0 ... 1.684836 0.000000
28 Sciences nec 264.0 ... 0.000000 0.000000
29 Engineering 51029.0 ... 1.312979 0.850497
30 Aerospace, aeronautical, and astronautical eng... 4665.0 ... 0.000000 0.000000
31 Bioengineering and biomedical engineering 0.0 ... NaN NaN
32 Chemical engineering 6647.0 ... 2.888521 0.000000
33 Civil engineering 15198.0 ... 0.782998 1.796289
34 Electrical, electronic, and communications eng... 10299.0 ... 3.359549 0.854452
35 Industrial and manufacturing engineering 0.0 ... NaN NaN
36 Mechanical engineering 7056.0 ... 0.000000 0.255102
37 Metallurgical and materials engineering 6473.0 ... 0.000000 0.849683
38 Engineering nec 691.0 ... 1.881331 0.000000
39 Non-S&E 41017.0 ... 1.250701 0.214545
40 Business management and business administration 11919.0 ... 0.092290 0.000000
41 Communication and communications technologies 2675.0 ... 0.523364 0.074766
42 Education 9731.0 ... 1.459254 0.000000
43 Humanities 4813.0 ... 1.059630 1.225847
44 Law 2076.0 ... 9.633911 0.000000
45 Social work 4557.0 ... 0.987492 0.285275
46 Visual and performing arts 2803.0 ... 0.356761 0.499465
47 Non-S&E nec 2443.0 ... 1.637331 0.000000
[48 rows x 14 columns]

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

50
run.py Normal file
View File

@@ -0,0 +1,50 @@
import os
import logging
import awkward as ak
import utils
import scorecard_tools
import excel_tools
if __name__ == "__main__":
    # Command-line interface: the shared options plus the two specific to
    # this script (plot output directory and the school to report on).
    parser = utils.get_common_args(prog="Main Data Parser")
    parser.add_argument(
        "-o",
        "--output-dir",
        default="plots",
        help="Directory to save generated plots",
    )
    parser.add_argument("-s", "--school", default="The University of Alabama")
    args = parser.parse_args()

    # Logging must be configured before the first message is emitted.
    utils.setup_logging(args.debug)
    logger = logging.getLogger("DataAnalysis")

    # Locate the converted scorecard file inside the first entry that
    # os.listdir() reports under <data-dir>/scorecard.
    scorecard_root = os.path.join(args.data_dir, "scorecard")
    scorecard_file = os.path.join(
        scorecard_root,
        os.listdir(scorecard_root)[0],
        "merged.parquet",
    )

    logger.info(f"Loading College Scorecard data from file {scorecard_file}")
    scorecard_data = ak.from_parquet(scorecard_file)

    os.makedirs(args.output_dir, exist_ok=True)

    # Plots: every plot is produced from the filtered data set.
    logger.info("Saving scorecard-only statistics")
    scorecard_data = scorecard_tools.mask_valid(scorecard_data)
    scorecard_tools.save_scorecard_stats(args.output_dir, scorecard_data)
    scorecard_tools.save_scorecard_stats_national(args.output_dir, scorecard_data)
    scorecard_tools.save_scorecard_stats_school(args.output_dir, scorecard_data, args.school)

    # Tabular output: national figures first, then the selected school.
    scorecard_tools.print_enrollment_national(scorecard_data)
    scorecard_tools.print_enrollment(scorecard_data, args.school)

    # Funding spreadsheet (the file name is fixed to ua.xlsx in the data dir).
    excel_data = excel_tools.read_excel(os.path.join(args.data_dir, "ua.xlsx"))
    excel_tools.print_data(excel_data)

162
scorecard_tools.py Normal file
View File

@@ -0,0 +1,162 @@
import os
import logging
import awkward as ak
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Module-level logger used by the print_* functions below for section headings.
logger = logging.getLogger("Scorecard Data")
def mask_valid(data):
    """Drop rows whose physical-science series is a NaN or all-zero placeholder.

    Only ``data.academics.program_percentage.physical_science`` is inspected.
    A row is removed when, along axis 1, that series contains any NaN or
    consists entirely of zeros. The resulting row mask is applied to the whole
    record array.

    Args:
        data: Scorecard record array (Awkward Array).

    Returns:
        The subset of *data* whose rows passed both checks.
    """
    reference = data.academics.program_percentage.physical_science
    has_nan = ak.any(np.isnan(reference), axis=1)
    all_zero = ak.all(reference == 0, axis=1)
    # Keep a row only if it is neither NaN-contaminated nor all-zero.
    return data[~has_nan & ~all_zero]
def mask_school(data, name):
    """Return the first row of *data* whose most recent school name is *name*.

    The comparison uses the last entry along axis 1 of ``data.school.name``
    and requires an exact match.

    Args:
        data: Scorecard record array.
        name: School name to look up.

    Returns:
        The first matching row.

    Raises:
        IndexError: If no row matches *name*. The message names the school
            that was searched for, instead of a bare out-of-range error.
    """
    matches = data[data.school.name[:, -1] == name]
    if len(matches) == 0:
        raise IndexError(f"No school named {name!r} found in the scorecard data")
    return matches[0]
# Percentage enrollment per field over time averaged across the nation
def save_scorecard_stats_national(output_dir, data, ext="png"):
    """Plot national per-field enrollment over time and save two figures.

    For every field in ``data.academics.program_percentage`` the mean over
    axis 0 (all rows of *data*) is computed. Fields whose mean stays below
    0.05 at every position are left out of both figures to keep the legend
    readable.

    Files written to *output_dir*:
        * ``national-enrollment-percent.<ext>``: mean share per field, x100.
        * ``national-enrollment-students.<ext>``: mean of share multiplied by
          ``data.student.enrollment.undergrad_12_month``.

    Args:
        output_dir: Existing directory that receives the image files.
        data: Scorecard record array.
        ext: Image file extension passed on to matplotlib via the file name.
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month

    # Compute each field's mean share once and remember which fields pass the
    # threshold, so both figures show exactly the same set of fields.
    plotted = []
    for key in prog_percentage.fields:
        mean_share = ak.mean(prog_percentage[key], axis=0)
        if ak.all(mean_share < 0.05):
            continue
        plotted.append((key, mean_share))

    # Percent enrollment
    for key, mean_share in plotted:
        plt.plot(mean_share * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-percent.{ext}"))
    plt.clf()

    # Students enrolled
    for key, _ in plotted:
        plt.plot(ak.mean(prog_percentage[key] * students, axis=0), label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-students.{ext}"))
    plt.clf()
# Percentage enrollment per field over time for one school
def save_scorecard_stats_school(output_dir, data, school: str, ext="png"):
    """Plot per-field enrollment over time for a single school.

    The school's row is selected with ``mask_school``. A field is plotted only
    if its mean over the school's series is at least 0.05; the same set of
    fields appears in both figures.

    Files written to *output_dir*:
        * ``<school>-enrollment-percent.<ext>``: share per field, x100.
        * ``<school>-enrollment-students.<ext>``: share multiplied by
          ``student.enrollment.undergrad_12_month``.

    Args:
        output_dir: Existing directory that receives the image files.
        data: Scorecard record array.
        school: School name, matched exactly by ``mask_school``; also used
            verbatim in the plot title and the file names.
        ext: Image file extension passed on to matplotlib via the file name.
    """
    data = mask_school(data, school)
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month

    # Select the fields once: skip those whose mean share for this school is
    # below the 0.05 threshold.
    plotted = []
    for key in prog_percentage.fields:
        mean = ak.mean(prog_percentage[key], axis=0)
        if mean < 0.05:
            continue
        plotted.append(key)

    # Plot percent enrollment
    for key in plotted:
        plt.plot(prog_percentage[key] * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-percent.{ext}"))
    plt.clf()

    # Plot student enrollment
    for key in plotted:
        plt.plot(prog_percentage[key] * students, label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-students.{ext}"))
    plt.clf()
def save_scorecard_stats(output_dir, data, ext="png"):
    """Save two histograms of ``student.enrollment.undergrad_12_month``.

    Both histograms use 1,000-wide bins from 0 up to 99,000 and a logarithmic
    y axis.

    Files written to *output_dir*:
        * ``dist-students-enrolled-avg.<ext>``: per-row mean over axis 1.
        * ``dist-students-enrolled-last-year.<ext>``: last entry of each row.

    Args:
        output_dir: Existing directory that receives the image files.
        data: Scorecard record array.
        ext: Image file extension passed on to matplotlib via the file name.
    """
    enrollment = data.student.enrollment.undergrad_12_month
    bins = np.arange(0, 100_000, 1_000)

    # Students Enrolled Distribution (Average)
    plt.hist(ak.mean(enrollment, axis=1), bins=bins)
    plt.xlabel("Students Enrolled")
    plt.ylabel("University Count")
    plt.title("Enrollment Distribution (Average)")
    plt.yscale("log")
    plt.savefig(os.path.join(output_dir, f"dist-students-enrolled-avg.{ext}"))
    plt.clf()

    # Students Enrolled Distribution (Last Year)
    plt.hist(enrollment[:, -1], bins=bins)
    plt.xlabel("Students Enrolled")
    plt.ylabel("University Count")
    plt.title("Enrollment Distribution (Last Year)")
    plt.yscale("log")
    plt.savefig(os.path.join(output_dir, f"dist-students-enrolled-last-year.{ext}"))
    plt.clf()
# Print national enrollment stats
def print_enrollment_national(data):
    """Print national per-field enrollment, sorted two ways.

    Builds one table row per field of ``data.academics.program_percentage``:

    * ``Percent (Last)``: mean share in the last position along axis 1, x100.
    * ``Percent (Avg)``: mean share over all entries, x100.
    * ``Students (Last)`` / ``Students (Avg)``: the same, with the share
      multiplied by ``data.student.enrollment.undergrad_12_month``.

    The table is printed twice, sorted descending by ``Percent (Last)`` and
    then by ``Percent (Avg)``; each print is preceded by a log heading.

    Args:
        data: Scorecard record array with rows on axis 0 and the time series
            on axis 1 (the layout the other functions in this module use).
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month
    names = list(prog_percentage.fields)

    # "Last" must slice the final entry of axis 1 for every row; a plain
    # [-1] would instead pick the last row of the data set.
    students_last = students[:, -1]

    # Enrollment counts can be missing (NaN) for individual rows, so the
    # student columns use a NaN-ignoring mean to avoid an all-NaN column.
    df = pd.DataFrame({
        'Name': names,
        'Percent (Last)': [ak.mean(prog_percentage[key][:, -1]) * 100 for key in names],
        'Percent (Avg)': [ak.mean(prog_percentage[key]) * 100 for key in names],
        "Students (Last)": [ak.nanmean(prog_percentage[key][:, -1] * students_last) for key in names],
        "Students (Avg)": [ak.nanmean(prog_percentage[key] * students) for key in names],
    })

    # Print Dataframe
    logger.info("\n\nNational Percentage (Last Academic Year): ")
    print(df.sort_values("Percent (Last)", ascending=False))
    logger.info("\n\nNational Percentage (Avg): ")
    print(df.sort_values("Percent (Avg)", ascending=False))
def print_enrollment(data, name: str):
    """Print per-field enrollment for one school, sorted two ways.

    The school's row is selected with ``mask_school``. One table row is built
    per field of ``academics.program_percentage``:

    * ``Percent (Last)``: last value of the field's series, x100.
    * ``Percent (Avg)``: mean of the series, x100.
    * ``Students (Last)`` / ``Students (Avg)``: the same, with the share
      multiplied by ``student.enrollment.undergrad_12_month``.

    The table is printed twice, sorted descending by ``Percent (Last)`` and
    then by ``Percent (Avg)``; each print is preceded by a log heading.

    Args:
        data: Scorecard record array.
        name: School name, matched exactly by ``mask_school``.
    """
    school_row = mask_school(data, name)
    shares = school_row.academics.program_percentage
    head_count = school_row.student.enrollment.undergrad_12_month
    field_names = list(shares.fields)

    # One list per column, all in field order, for this single school.
    table = pd.DataFrame({
        'Name': field_names,
        'Percent (Last)': [shares[field][-1] * 100 for field in field_names],
        'Percent (Avg)': [ak.mean(shares[field]) * 100 for field in field_names],
        "Students (Last)": [shares[field][-1] * head_count[-1] for field in field_names],
        "Students (Avg)": [ak.mean(shares[field] * head_count) for field in field_names],
    })

    # Print Dataframe
    logger.info(f"\n\nProgram Percentage ({name}, Last Academic Year): ")
    print(table.sort_values("Percent (Last)", ascending=False))
    logger.info(f"\n\nProgram Percentage ({name}, Avg): ")
    print(table.sort_values("Percent (Avg)", ascending=False))

61
utils.py Normal file
View File

@@ -0,0 +1,61 @@
import argparse
import logging
import os
# Module-level logger; not referenced by the code visible in this file.
logger = logging.getLogger("utils")
# Common Args
def get_common_args(prog: str):
    """Create an argument parser preloaded with the options every script shares.

    Options:
        ``-D`` / ``--data-dir``: directory for downloaded data; defaults to the
            ``DATA_DIRECTORY`` environment variable, or ``"data"`` if unset.
        ``-d`` / ``--debug``: flag that enables verbose logging.

    Args:
        prog: Program name shown in usage/help output.

    Returns:
        The ``argparse.ArgumentParser``; callers may add further options
        before calling ``parse_args()``.
    """
    # The environment is consulted once, when the parser is built.
    default_data_dir = os.environ.get("DATA_DIRECTORY", "data")

    parser = argparse.ArgumentParser(prog=prog)
    parser.add_argument(
        "-D",
        "--data-dir",
        default=default_data_dir,
        help="Directory for downloaded data",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        default=False,
        help="Enable verbose logging",
    )
    return parser
# LOGGING
class Formatter(logging.Formatter):
    """Log formatter that wraps each record in an ANSI colour chosen by level.

    DEBUG is grey, WARNING yellow, ERROR red and CRITICAL bold red; INFO is
    left uncoloured. Records with any other (custom) level are also emitted
    uncoloured instead of failing.
    """

    # ANSI escape sequences
    grey = "\x1b[37m"
    yellow = "\x1b[33m"
    red = "\x1b[31m"
    bold_red = "\x1b[1;31m"
    reset = "\x1b[0m"

    # Shared layout. Kept under its own name so it is not shadowed by the
    # format() method defined below.
    _layout = "%(asctime)s - %(name)-24s - %(levelname)-7s - %(message)s (%(filename)s:%(lineno)d)"

    # One pre-built formatter per standard level.
    FORMATS = {
        logging.DEBUG: logging.Formatter(grey + _layout + reset),
        logging.INFO: logging.Formatter(_layout),
        logging.WARNING: logging.Formatter(yellow + _layout + reset),
        logging.ERROR: logging.Formatter(red + _layout + reset),
        logging.CRITICAL: logging.Formatter(bold_red + _layout + reset),
    }

    # Used for levels that have no entry in FORMATS.
    _fallback = logging.Formatter(_layout)

    def format(self, record):
        """Format *record* with the formatter registered for its level."""
        return self.FORMATS.get(record.levelno, self._fallback).format(record)
def setup_logging(debug: bool = False):
    """Configure root logging with a coloured stream handler.

    Args:
        debug: When true the root level is ``DEBUG``; otherwise ``INFO``.
    """
    level = logging.DEBUG if debug else logging.INFO

    handler = logging.StreamHandler()
    handler.setFormatter(Formatter())

    logging.basicConfig(handlers=[handler], level=level)