feat: final push for submission
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
data/
|
||||
.ipynb_checkpoints/
|
||||
__pycache__/
|
||||
|
||||
48
README.md
Normal file
@@ -0,0 +1,48 @@
|
||||
# Educational Statistics Analysis Tool
|
||||
|
||||
Python utility to parse data from the Department of Education and National Science Foundation.
|
||||
|
||||
## Description
|
||||
|
||||
This project was created to answer the question "If I have $5, should I give it to the art department or to the history department?"
|
||||
|
||||
This project does so by gathering historical enrollment data from the College Scorecard by the Department of Education, indexable by year, institution, and subject, and comparing it against the National Science Foundation's data on funding, which provides data on subject funding indexable by year, institution, subject, and funding source.
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Dependencies
|
||||
|
||||
* This project was built against Python 3.12
|
||||
* Some scripts require a very large amount of memory (RAM). An HPC cluster or other server environment is recommended.
|
||||
|
||||
### Executing program
|
||||
|
||||
* Due to concerns with license restrictions on data, I have opted not to redistribute any raw or modified data files with this project. Instead, data must be downloaded from the DoE and NSF directly. This was automated with the `download.sh` file.
|
||||
* The `.csv` files downloaded from the College Scorecard must be converted into an Awkward Array for effective data analysis. This is done with `convert.py`. **This step requires 80GB of RAM**, and as such, must be done on a machine with sufficient memory.
|
||||
* Final plotting can be done with `run.py`. Sample outputs are provided for user convenience in `output.txt` (for tabular outputs) and in the `plots/` directory.
|
||||
|
||||
## Help Wanted!
|
||||
|
||||
A traditional repository would have the Issues tab in their git host acting as their feature request / help wanted section. However, I've not tested the Issues tab (as I'm also the only one who uses this git server instance), so a simple list will suffice:
|
||||
|
||||
- Implement cross-comparison of subject fields with the best available subject equivalent in the funding column
|
||||
- Implement mass-scraping of the NSF site to gather funding data for all institutions across all years, rather than just 2025 data for UA
|
||||
- Modify `convert.py` to accept the CSVs as Dask DataFrames rather than as Pandas tables, to allow for distributed and/or delayed computing of the conversion. This would allow for people with ordinary laptops to run the program
|
||||
|
||||
## Authors
|
||||
|
||||
* Nathan Nguyen
|
||||
|
||||
## Version History
|
||||
|
||||
* 0.1
|
||||
* Initial Release
|
||||
|
||||
## License
|
||||
|
||||
I'm not a lawyer - most of this data probably has restrictions, anyways. If I was, though, I'd use the [NAME HERE](https://www.youtube.com/watch?v=XfELJU1mRMg) License - see the LICENSE.md file for details
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
Inspiration, code snippets, etc.
|
||||
* [ReadMe Template](https://gist.githubusercontent.com/DomPizzie/7a5ff55ffa9081f2de27c315f5018afc/raw/d59043abbb123089ad6602aba571121b71d91d7f/README-Template.md)
|
||||
58
convert.py
@@ -1,31 +1,49 @@
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
|
||||
import awkward as ak
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import tqdm
|
||||
|
||||
scorecard_dir = "data/scorecard"
|
||||
scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
|
||||
import utils
|
||||
|
||||
print("Loading data.yaml")
|
||||
with open(os.path.join(scorecard_dir, 'data.yaml'), 'r') as file:
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Setup Args
|
||||
parser = utils.get_common_args(prog="CSV -> Pandas Scorecard Converter")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup Logging
|
||||
utils.setup_logging(args.debug)
|
||||
logger = logging.getLogger("CSVPandasConverter")
|
||||
|
||||
scorecard_dir = os.path.join(args.data_dir, "scorecard")
|
||||
scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
|
||||
logger.info(f"Loading College Scorecard data from directory {scorecard_dir}")
|
||||
|
||||
logger.debug("Loading metadata from data.yaml")
|
||||
with open(os.path.join(scorecard_dir, 'data.yaml'), 'r') as file:
|
||||
data = yaml.safe_load(file)
|
||||
|
||||
print("Loading CSVs to dataframes")
|
||||
files = [f'MERGED{i}_{(i + 1) % 100:02}_PP.csv' for i in tqdm.trange(1996, 2024)]
|
||||
dataframes = [pd.read_csv(os.path.join(scorecard_dir, file)) for file in tqdm.tqdm(files)]
|
||||
logger.info("Loading all CSV files as Pandas dataframes")
|
||||
files = [f'MERGED{i}_{(i + 1) % 100:02}_PP.csv' for i in tqdm.trange(1996, 2024)]
|
||||
dataframes = [pd.read_csv(os.path.join(scorecard_dir, file)) for file in tqdm.tqdm(files)]
|
||||
|
||||
print("Appending extra rows where needed")
|
||||
unit_ids = np.unique(np.hstack([frame.UNITID.to_numpy() for frame in tqdm.tqdm(dataframes)]))
|
||||
for i, frame in tqdm.tqdm(enumerate(dataframes)):
|
||||
logger.info("Creating list of all UNITIDs across all files")
|
||||
unit_ids = np.unique(np.hstack([frame.UNITID.to_numpy() for frame in tqdm.tqdm(dataframes)]))
|
||||
|
||||
logger.info("Appending extra columns to each year's dataframes to prepare for appending")
|
||||
for i, frame in tqdm.tqdm(enumerate(dataframes)):
|
||||
new_rows = pd.DataFrame({"UNITID": unit_ids[~np.isin(unit_ids, frame.UNITID)]})
|
||||
dataframes[i] = pd.concat([frame, new_rows]).sort_values(by=["UNITID", "OPEID"])
|
||||
|
||||
print("Converting to Results Array")
|
||||
result = {}
|
||||
for key, sec in tqdm.tqdm(data['dictionary'].items()):
|
||||
logger.info("Converting to Results Array")
|
||||
result = {}
|
||||
for key, sec in tqdm.tqdm(data['dictionary'].items()):
|
||||
if 'calculate' in sec:
|
||||
continue
|
||||
|
||||
@@ -49,12 +67,12 @@ for key, sec in tqdm.tqdm(data['dictionary'].items()):
|
||||
obj = obj.astype(str)
|
||||
section[parts[-1]] = obj
|
||||
|
||||
print("Cleanup: Deleting Dataframes from Memory")
|
||||
del dataframes # Memory cleanup
|
||||
logger.info("Cleanup: Deleting Dataframes from Memory")
|
||||
del dataframes # Memory cleanup
|
||||
|
||||
print("Converting to Awkward Array")
|
||||
a = ak.Array(result)
|
||||
del result # Memory cleanup
|
||||
logger.info("Converting to Awkward Array")
|
||||
a = ak.Array(result)
|
||||
del result # Memory cleanup
|
||||
|
||||
print("Writing to Disk")
|
||||
ak.to_parquet(a, os.path.join(scorecard_dir, "merged.parquet"))
|
||||
logger.info("Writing to Disk")
|
||||
ak.to_parquet(a, os.path.join(scorecard_dir, "merged.parquet"))
|
||||
31
excel_tools.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# Column labels read from the funding spreadsheet.
key_all = "All R&D expenditures"
key_federal = "Federal government"
key_state = "State and local government"
key_inst = "Institution funds"
key_business = "Business"
key_nonprofit = "Nonprofit organizations"
key_other = "All other sources"
# Funding-source columns that get a matching "<name> Percent" column.
keys = [key_federal, key_state, key_inst, key_business, key_nonprofit, key_other]
percent_suffix = " Percent"


def read_excel(file, header=3, footer_rows=4):
    """Load a funding spreadsheet and add a percent column per funding source.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        header: Zero-based row index holding the column labels (default 3,
            the value this function previously hard-coded).
        footer_rows: Number of trailing rows to discard (default 4, the value
            this function previously hard-coded). ``0`` keeps every row.

    Returns:
        A DataFrame without the trailing rows and without columns whose label
        contains ``"Unnamed"``, extended with one ``"<source> Percent"``
        column for every entry of ``keys``: that source's value divided by
        the ``key_all`` column, times 100. Rows whose total is zero yield
        NaN/inf percentages, as pandas division does.

    Raises:
        KeyError: if the sheet lacks ``key_all`` or any column in ``keys``.
    """
    df = pd.read_excel(file, header=header)

    # Remove fluff. ``df[:-0]`` would be empty, so only slice when asked to.
    if footer_rows:
        df = df.iloc[:-footer_rows]
    # str() guards against non-string labels (e.g. numeric headers).
    named_columns = [col for col in df.columns if 'Unnamed' not in str(col)]
    # Explicit copy: the percent columns below are assigned onto this frame,
    # which must not be a view/flagged copy of the slice above.
    df = df[named_columns].copy()

    # Add percent
    for key in keys:
        df[key + percent_suffix] = df[key] / df[key_all] * 100

    return df
|
||||
|
||||
def print_data(data):
    """Write the string form of ``data`` to standard output, newline-terminated."""
    print(str(data))
|
||||
@@ -30,7 +30,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 1,
|
||||
"id": "eeee645e-48b5-4bdf-a386-e3354c8ac46a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -45,7 +45,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"id": "e18b60a6-be26-473b-9938-c9d18c28b8aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -57,7 +57,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"execution_count": 3,
|
||||
"id": "45b02475-f4cb-4830-8367-05ef5eefaaee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -73,14 +73,16 @@
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6129bee1-fadb-42cd-a146-c6c7a5217bf9",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"# National Stats"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"execution_count": 6,
|
||||
"id": "05ec674e-60b3-491c-b056-b7518fbb451c",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
|
||||
223
output.txt
Normal file
@@ -0,0 +1,223 @@
|
||||
2025-12-04 15:17:57,143 - DataAnalysis - INFO - Loading College Scorecard data from file data/scorecard/College_Scorecard_Raw_Data_05192025/merged.parquet (run.py:33)
|
||||
2025-12-04 15:18:20,065 - DataAnalysis - INFO - Saving scorecard-only statistics (run.py:38)
|
||||
2025-12-04 15:18:23,197 - Scorecard Data - INFO -
|
||||
|
||||
National Percentage (Last Academic Year): (scorecard_tools.py:135)
|
||||
2025-12-04 15:18:23,209 - Scorecard Data - INFO -
|
||||
|
||||
National Percentage (Avg): (scorecard_tools.py:138)
|
||||
2025-12-04 15:18:23,493 - Scorecard Data - INFO -
|
||||
|
||||
Program Percentage (The University of Alabama, Last Academic Year): (scorecard_tools.py:158)
|
||||
2025-12-04 15:18:23,504 - Scorecard Data - INFO -
|
||||
|
||||
Program Percentage (The University of Alabama, Avg): (scorecard_tools.py:161)
|
||||
/opt/conda/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
|
||||
warn("Workbook contains no default style, apply openpyxl's default")
|
||||
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
|
||||
35 health 22.149643 12.027962 5960.320075 NaN
|
||||
36 business_marketing 14.511786 17.399856 3653.717364 NaN
|
||||
15 humanities 10.181786 9.546438 3624.366286 NaN
|
||||
8 education 7.426786 6.705871 2231.885475 NaN
|
||||
6 computer 7.148571 2.808726 2416.331129 NaN
|
||||
31 mechanic_repair_technology 6.220357 0.811160 1382.265304 NaN
|
||||
27 security_law_enforcement 4.961071 3.019295 1735.563754 NaN
|
||||
10 engineering_technology 3.409286 1.377941 962.291996 NaN
|
||||
20 multidiscipline 2.957857 1.964365 981.708071 NaN
|
||||
12 family_consumer_science 2.880357 1.222658 799.266364 NaN
|
||||
32 precision_production 2.701429 0.379921 1043.298893 NaN
|
||||
17 biological 2.671786 4.908001 913.827739 NaN
|
||||
29 social_science 2.312143 6.253486 567.835936 NaN
|
||||
28 public_administration_social_service 1.677500 1.601816 531.436800 NaN
|
||||
26 psychology 1.285714 5.195744 529.515436 NaN
|
||||
4 communication 1.136786 3.192944 456.121346 NaN
|
||||
9 engineering 0.972143 2.803155 333.565925 NaN
|
||||
34 visual_performing 0.807500 3.903754 287.056654 NaN
|
||||
7 personal_culinary 0.698571 0.444786 246.205414 NaN
|
||||
11 language 0.633929 0.951896 224.890446 NaN
|
||||
18 mathematics 0.611071 1.048517 205.602029 NaN
|
||||
21 parks_recreation_fitness 0.593214 1.885363 225.476271 NaN
|
||||
30 construction 0.573571 0.317400 232.164054 NaN
|
||||
13 legal 0.529286 0.366860 151.130189 NaN
|
||||
24 physical_science 0.421429 1.598245 125.394396 NaN
|
||||
14 english 0.237500 2.671756 91.474121 NaN
|
||||
37 history 0.208929 1.670524 76.740832 NaN
|
||||
22 philosophy_religious 0.072500 0.677424 26.515154 NaN
|
||||
3 ethnic_cultural_gender 0.007500 0.368425 1.937561 NaN
|
||||
1 resources 0.000000 0.700875 0.000000 NaN
|
||||
5 communications_technology 0.000000 0.215724 0.000000 NaN
|
||||
2 architecture 0.000000 0.243856 0.000000 NaN
|
||||
0 agriculture 0.000000 0.819862 0.000000 NaN
|
||||
16 library 0.000000 0.012911 0.000000 NaN
|
||||
19 military 0.000000 0.035292 0.000000 NaN
|
||||
25 science_technology 0.000000 0.080646 0.000000 NaN
|
||||
23 theology_religious_vocation 0.000000 0.432991 0.000000 NaN
|
||||
33 transportation 0.000000 0.331288 0.000000 NaN
|
||||
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
|
||||
36 business_marketing 14.511786 17.399856 3653.717364 NaN
|
||||
35 health 22.149643 12.027962 5960.320075 NaN
|
||||
15 humanities 10.181786 9.546438 3624.366286 NaN
|
||||
8 education 7.426786 6.705871 2231.885475 NaN
|
||||
29 social_science 2.312143 6.253486 567.835936 NaN
|
||||
26 psychology 1.285714 5.195744 529.515436 NaN
|
||||
17 biological 2.671786 4.908001 913.827739 NaN
|
||||
34 visual_performing 0.807500 3.903754 287.056654 NaN
|
||||
4 communication 1.136786 3.192944 456.121346 NaN
|
||||
27 security_law_enforcement 4.961071 3.019295 1735.563754 NaN
|
||||
6 computer 7.148571 2.808726 2416.331129 NaN
|
||||
9 engineering 0.972143 2.803155 333.565925 NaN
|
||||
14 english 0.237500 2.671756 91.474121 NaN
|
||||
20 multidiscipline 2.957857 1.964365 981.708071 NaN
|
||||
21 parks_recreation_fitness 0.593214 1.885363 225.476271 NaN
|
||||
37 history 0.208929 1.670524 76.740832 NaN
|
||||
28 public_administration_social_service 1.677500 1.601816 531.436800 NaN
|
||||
24 physical_science 0.421429 1.598245 125.394396 NaN
|
||||
10 engineering_technology 3.409286 1.377941 962.291996 NaN
|
||||
12 family_consumer_science 2.880357 1.222658 799.266364 NaN
|
||||
18 mathematics 0.611071 1.048517 205.602029 NaN
|
||||
11 language 0.633929 0.951896 224.890446 NaN
|
||||
0 agriculture 0.000000 0.819862 0.000000 NaN
|
||||
31 mechanic_repair_technology 6.220357 0.811160 1382.265304 NaN
|
||||
1 resources 0.000000 0.700875 0.000000 NaN
|
||||
22 philosophy_religious 0.072500 0.677424 26.515154 NaN
|
||||
7 personal_culinary 0.698571 0.444786 246.205414 NaN
|
||||
23 theology_religious_vocation 0.000000 0.432991 0.000000 NaN
|
||||
32 precision_production 2.701429 0.379921 1043.298893 NaN
|
||||
3 ethnic_cultural_gender 0.007500 0.368425 1.937561 NaN
|
||||
13 legal 0.529286 0.366860 151.130189 NaN
|
||||
33 transportation 0.000000 0.331288 0.000000 NaN
|
||||
30 construction 0.573571 0.317400 232.164054 NaN
|
||||
2 architecture 0.000000 0.243856 0.000000 NaN
|
||||
5 communications_technology 0.000000 0.215724 0.000000 NaN
|
||||
25 science_technology 0.000000 0.080646 0.000000 NaN
|
||||
19 military 0.000000 0.035292 0.000000 NaN
|
||||
16 library 0.000000 0.012911 0.000000 NaN
|
||||
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
|
||||
36 business_marketing 30.11 28.734643 10662.8543 7409.063086
|
||||
9 engineering 9.66 8.218571 3420.8958 2251.159975
|
||||
35 health 9.66 9.055000 3420.8958 2404.925375
|
||||
4 communication 9.42 10.607143 3335.9046 2703.289471
|
||||
29 social_science 7.99 4.308214 2829.4987 1147.649796
|
||||
12 family_consumer_science 6.96 7.195714 2464.7448 1848.827150
|
||||
26 psychology 5.13 3.813929 1816.6869 978.664311
|
||||
17 biological 3.89 3.384643 1377.5657 878.928314
|
||||
21 parks_recreation_fitness 2.86 0.308929 1012.8118 107.747239
|
||||
8 education 2.72 7.219643 963.2336 1749.198443
|
||||
34 visual_performing 2.17 3.532143 768.4621 860.670914
|
||||
20 multidiscipline 1.72 2.466786 609.1036 591.093271
|
||||
6 computer 1.44 0.937500 509.9472 248.493907
|
||||
37 history 1.17 1.755714 414.3321 421.027957
|
||||
28 public_administration_social_service 1.09 1.165357 386.0017 286.223114
|
||||
24 physical_science 1.06 0.981071 375.3778 256.140975
|
||||
14 english 0.97 1.992857 343.5061 458.707082
|
||||
18 mathematics 0.89 0.571786 315.1757 164.741461
|
||||
11 language 0.45 0.713929 159.3585 173.877018
|
||||
1 resources 0.38 0.251786 134.5694 77.018761
|
||||
22 philosophy_religious 0.18 0.357143 63.7434 89.032021
|
||||
3 ethnic_cultural_gender 0.09 0.299286 31.8717 73.607264
|
||||
5 communications_technology 0.00 0.000000 0.0000 0.000000
|
||||
2 architecture 0.00 0.017500 0.0000 2.880346
|
||||
0 agriculture 0.00 0.000000 0.0000 0.000000
|
||||
16 library 0.00 0.000000 0.0000 0.000000
|
||||
10 engineering_technology 0.00 0.000000 0.0000 0.000000
|
||||
7 personal_culinary 0.00 0.000000 0.0000 0.000000
|
||||
15 humanities 0.00 0.001429 0.0000 0.228757
|
||||
13 legal 0.00 0.000000 0.0000 0.000000
|
||||
19 military 0.00 0.000000 0.0000 0.000000
|
||||
27 security_law_enforcement 0.00 2.109286 0.0000 521.904729
|
||||
23 theology_religious_vocation 0.00 0.000000 0.0000 0.000000
|
||||
25 science_technology 0.00 0.000000 0.0000 0.000000
|
||||
33 transportation 0.00 0.000000 0.0000 0.000000
|
||||
32 precision_production 0.00 0.000000 0.0000 0.000000
|
||||
31 mechanic_repair_technology 0.00 0.000000 0.0000 0.000000
|
||||
30 construction 0.00 0.000000 0.0000 0.000000
|
||||
Name Percent (Last) Percent (Avg) Students (Last) Students (Avg)
|
||||
36 business_marketing 30.11 28.734643 10662.8543 7409.063086
|
||||
4 communication 9.42 10.607143 3335.9046 2703.289471
|
||||
35 health 9.66 9.055000 3420.8958 2404.925375
|
||||
9 engineering 9.66 8.218571 3420.8958 2251.159975
|
||||
8 education 2.72 7.219643 963.2336 1749.198443
|
||||
12 family_consumer_science 6.96 7.195714 2464.7448 1848.827150
|
||||
29 social_science 7.99 4.308214 2829.4987 1147.649796
|
||||
26 psychology 5.13 3.813929 1816.6869 978.664311
|
||||
34 visual_performing 2.17 3.532143 768.4621 860.670914
|
||||
17 biological 3.89 3.384643 1377.5657 878.928314
|
||||
20 multidiscipline 1.72 2.466786 609.1036 591.093271
|
||||
27 security_law_enforcement 0.00 2.109286 0.0000 521.904729
|
||||
14 english 0.97 1.992857 343.5061 458.707082
|
||||
37 history 1.17 1.755714 414.3321 421.027957
|
||||
28 public_administration_social_service 1.09 1.165357 386.0017 286.223114
|
||||
24 physical_science 1.06 0.981071 375.3778 256.140975
|
||||
6 computer 1.44 0.937500 509.9472 248.493907
|
||||
11 language 0.45 0.713929 159.3585 173.877018
|
||||
18 mathematics 0.89 0.571786 315.1757 164.741461
|
||||
22 philosophy_religious 0.18 0.357143 63.7434 89.032021
|
||||
21 parks_recreation_fitness 2.86 0.308929 1012.8118 107.747239
|
||||
3 ethnic_cultural_gender 0.09 0.299286 31.8717 73.607264
|
||||
1 resources 0.38 0.251786 134.5694 77.018761
|
||||
2 architecture 0.00 0.017500 0.0000 2.880346
|
||||
15 humanities 0.00 0.001429 0.0000 0.228757
|
||||
0 agriculture 0.00 0.000000 0.0000 0.000000
|
||||
5 communications_technology 0.00 0.000000 0.0000 0.000000
|
||||
7 personal_culinary 0.00 0.000000 0.0000 0.000000
|
||||
10 engineering_technology 0.00 0.000000 0.0000 0.000000
|
||||
13 legal 0.00 0.000000 0.0000 0.000000
|
||||
19 military 0.00 0.000000 0.0000 0.000000
|
||||
16 library 0.00 0.000000 0.0000 0.000000
|
||||
23 theology_religious_vocation 0.00 0.000000 0.0000 0.000000
|
||||
25 science_technology 0.00 0.000000 0.0000 0.000000
|
||||
33 transportation 0.00 0.000000 0.0000 0.000000
|
||||
32 precision_production 0.00 0.000000 0.0000 0.000000
|
||||
31 mechanic_repair_technology 0.00 0.000000 0.0000 0.000000
|
||||
30 construction 0.00 0.000000 0.0000 0.000000
|
||||
Field All R&D expenditures ... Nonprofit organizations Percent All other sources Percent
|
||||
0 All R&D fields 184880.0 ... 0.948183 0.376460
|
||||
1 Science 92834.0 ... 0.613999 0.187431
|
||||
2 Computer and information sciences 18813.0 ... 0.015946 0.212619
|
||||
3 Geosciences, atmospheric sciences, and ocean s... 16822.0 ... 0.606349 0.095114
|
||||
4 Atmospheric science and meteorology 0.0 ... NaN NaN
|
||||
5 Geological and earth sciences 14248.0 ... 0.715890 0.112296
|
||||
6 Ocean sciences and marine sciences 2574.0 ... 0.000000 0.000000
|
||||
7 Geosciences, atmospheric sciences, and ocean s... 0.0 ... NaN NaN
|
||||
8 Life sciences 23853.0 ... 1.224165 0.314426
|
||||
9 Agricultural sciences 11.0 ... 0.000000 0.000000
|
||||
10 Biological and biomedical sciences 10055.0 ... 0.616609 0.159125
|
||||
11 Health sciences 13736.0 ... 1.674432 0.422248
|
||||
12 Natural resources and conservation 0.0 ... NaN NaN
|
||||
13 Life sciences nec 51.0 ... 0.000000 1.960784
|
||||
14 Mathematics and statistics 2918.0 ... 0.959561 0.000000
|
||||
15 Physical sciences 11124.0 ... 0.107875 0.000000
|
||||
16 Astronomy and astrophysics 271.0 ... 0.000000 0.000000
|
||||
17 Chemistry 7938.0 ... 0.151172 0.000000
|
||||
18 Materials science 0.0 ... NaN NaN
|
||||
19 Physics 2911.0 ... 0.000000 0.000000
|
||||
20 Physical sciences nec 4.0 ... 0.000000 0.000000
|
||||
21 Psychology 5690.0 ... 0.351494 0.000000
|
||||
22 Social sciences 13350.0 ... 0.846442 0.322097
|
||||
23 Anthropology 792.0 ... 1.767677 5.429293
|
||||
24 Economics 3707.0 ... 0.000000 0.000000
|
||||
25 Political science and government 898.0 ... 1.559020 0.000000
|
||||
26 Sociology, demography, and population studies 2908.0 ... 0.000000 0.000000
|
||||
27 Social sciences nec 5045.0 ... 1.684836 0.000000
|
||||
28 Sciences nec 264.0 ... 0.000000 0.000000
|
||||
29 Engineering 51029.0 ... 1.312979 0.850497
|
||||
30 Aerospace, aeronautical, and astronautical eng... 4665.0 ... 0.000000 0.000000
|
||||
31 Bioengineering and biomedical engineering 0.0 ... NaN NaN
|
||||
32 Chemical engineering 6647.0 ... 2.888521 0.000000
|
||||
33 Civil engineering 15198.0 ... 0.782998 1.796289
|
||||
34 Electrical, electronic, and communications eng... 10299.0 ... 3.359549 0.854452
|
||||
35 Industrial and manufacturing engineering 0.0 ... NaN NaN
|
||||
36 Mechanical engineering 7056.0 ... 0.000000 0.255102
|
||||
37 Metallurgical and materials engineering 6473.0 ... 0.000000 0.849683
|
||||
38 Engineering nec 691.0 ... 1.881331 0.000000
|
||||
39 Non-S&E 41017.0 ... 1.250701 0.214545
|
||||
40 Business management and business administration 11919.0 ... 0.092290 0.000000
|
||||
41 Communication and communications technologies 2675.0 ... 0.523364 0.074766
|
||||
42 Education 9731.0 ... 1.459254 0.000000
|
||||
43 Humanities 4813.0 ... 1.059630 1.225847
|
||||
44 Law 2076.0 ... 9.633911 0.000000
|
||||
45 Social work 4557.0 ... 0.987492 0.285275
|
||||
46 Visual and performing arts 2803.0 ... 0.356761 0.499465
|
||||
47 Non-S&E nec 2443.0 ... 1.637331 0.000000
|
||||
|
||||
[48 rows x 14 columns]
|
||||
BIN
plots/The University of Alabama-enrollment-percent.png
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
plots/The University of Alabama-enrollment-students.png
Normal file
|
After Width: | Height: | Size: 61 KiB |
BIN
plots/The University of Alabama-enrollment.png
Normal file
|
After Width: | Height: | Size: 60 KiB |
BIN
plots/dist-students-enrolled-avg.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
plots/dist-students-enrolled-last-year.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
plots/national-enrollment-percent.png
Normal file
|
After Width: | Height: | Size: 50 KiB |
BIN
plots/national-enrollment-students.png
Normal file
|
After Width: | Height: | Size: 44 KiB |
BIN
plots/national-enrollment.png
Normal file
|
After Width: | Height: | Size: 50 KiB |
50
run.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import os
|
||||
import logging
|
||||
|
||||
import awkward as ak
|
||||
|
||||
import utils
|
||||
import scorecard_tools
|
||||
import excel_tools
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Setup Args
    parser = utils.get_common_args(prog="Main Data Parser")
    parser.add_argument(
        "-o",
        "--output-dir",
        default="plots",
        help="Directory to save generated plots",
    )
    parser.add_argument(
        "-s",
        "--school",
        default="The University of Alabama",
        help="School name to produce per-school statistics for",
    )

    args = parser.parse_args()

    # Setup Logging
    utils.setup_logging(args.debug)
    logger = logging.getLogger("DataAnalysis")

    # Load Scorecard
    # The scorecard data sits in a sub-directory of <data-dir>/scorecard.
    # os.listdir() returns entries in arbitrary order and may include plain
    # files, so keep only directories and sort them for a deterministic pick.
    scorecard_root = os.path.join(args.data_dir, "scorecard")
    snapshots = sorted(
        entry
        for entry in os.listdir(scorecard_root)
        if os.path.isdir(os.path.join(scorecard_root, entry))
    )
    if not snapshots:
        parser.error(f"No College Scorecard directory found under {scorecard_root}")
    scorecard_dir = os.path.join(scorecard_root, snapshots[0])
    scorecard_file = os.path.join(scorecard_dir, "merged.parquet")

    logger.info(f"Loading College Scorecard data from file {scorecard_file}")
    scorecard_data = ak.from_parquet(scorecard_file)

    os.makedirs(args.output_dir, exist_ok=True)

    logger.info("Saving scorecard-only statistics")
    scorecard_data = scorecard_tools.mask_valid(scorecard_data)
    scorecard_tools.save_scorecard_stats(args.output_dir, scorecard_data)
    scorecard_tools.save_scorecard_stats_national(args.output_dir, scorecard_data)
    scorecard_tools.save_scorecard_stats_school(args.output_dir, scorecard_data, args.school)

    # Print
    scorecard_tools.print_enrollment_national(scorecard_data)
    scorecard_tools.print_enrollment(scorecard_data, args.school)

    # Load Excel
    excel_data = excel_tools.read_excel(os.path.join(args.data_dir, "ua.xlsx"))
    excel_tools.print_data(excel_data)
|
||||
162
scorecard_tools.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import os
|
||||
import logging
|
||||
|
||||
import awkward as ak
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
|
||||
logger = logging.getLogger("Scorecard Data")
|
||||
|
||||
|
||||
def mask_valid(data):
    """Drop records whose ``physical_science`` percentage series is unusable.

    A record is removed when its
    ``academics.program_percentage.physical_science`` series contains a NaN,
    or when every entry of that series equals zero (placeholder rows).

    Args:
        data: Awkward Array of scorecard records.
            NOTE(review): presumably one record per institution with one
            entry per academic year along axis 1 -- confirm against the
            converter.

    Returns:
        The records of ``data`` that pass both checks.
    """
    series = data.academics.program_percentage.physical_science
    contains_nan = ak.any(np.isnan(series), axis=1)
    only_zeros = ak.all(series == 0, axis=1)
    keep = ~contains_nan & ~only_zeros
    return data[keep]
|
||||
|
||||
def mask_school(data, name):
    """Return the first record whose latest school name equals ``name``.

    The name compared is the last entry of each record's ``school.name``
    series.

    Args:
        data: Array of scorecard records exposing ``school.name``.
        name: Exact school name to look up.

    Returns:
        The first matching record.

    Raises:
        IndexError: if no record matches. The exception type is kept as an
            index error so existing handlers still apply, but the message now
            names the school instead of reporting a bare out-of-range index.
    """
    matches = data[data.school.name[:, -1] == name]
    if len(matches) == 0:
        raise IndexError(f"No school named {name!r} found in the scorecard data")
    return matches[0]
|
||||
|
||||
|
||||
# Enrollment per field over time, averaged across all records
def save_scorecard_stats_national(output_dir, data, ext="png"):
    """Plot enrollment per field averaged over all records; save two figures.

    Writes into ``output_dir``:

    * ``national-enrollment-percent.<ext>`` -- mean program share per field,
      scaled by 100.
    * ``national-enrollment-students.<ext>`` -- mean of program share times
      ``student.enrollment.undergrad_12_month`` per field.

    Fields whose mean share stays below 0.05 at every position are left out
    of both figures to keep the legends readable.

    Args:
        output_dir: Existing directory that receives the figures.
        data: Awkward Array of scorecard records.
        ext: File extension / image format passed on to ``savefig``.
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month

    # Compute the mean share once per field and reuse it for both figures.
    mean_share_by_field = {}
    for key in prog_percentage.fields:
        mean_share = ak.mean(prog_percentage[key], axis=0)
        if ak.all(mean_share < 0.05):
            continue
        mean_share_by_field[key] = mean_share

    # Percent enrollment
    for key, mean_share in mean_share_by_field.items():
        plt.plot(mean_share * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-percent.{ext}"))
    plt.clf()

    # Students enrolled
    for key in mean_share_by_field:
        plt.plot(ak.mean(prog_percentage[key] * students, axis=0), label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-students.{ext}"))
    plt.clf()
|
||||
|
||||
|
||||
# Enrollment per field over time for one school
def save_scorecard_stats_school(output_dir, data, school: str, ext="png"):
    """Plot one school's enrollment per field and save two figures.

    Writes into ``output_dir``:

    * ``<school>-enrollment-percent.<ext>`` -- program share per field,
      scaled by 100.
    * ``<school>-enrollment-students.<ext>`` -- program share times
      ``student.enrollment.undergrad_12_month`` per field.

    Fields whose mean share over the school's series is below 0.05 are left
    out of both figures.

    Args:
        output_dir: Existing directory that receives the figures.
        data: Awkward Array of scorecard records.
        school: Exact school name, as expected by ``mask_school``.
        ext: File extension / image format passed on to ``savefig``.
    """
    record = mask_school(data, school)
    prog_percentage = record.academics.program_percentage
    students = record.student.enrollment.undergrad_12_month

    # Keep a field unless its mean share over this school's series is < 0.05.
    # Written as ``not (mean < 0.05)`` so a NaN mean is still plotted, exactly
    # as the original ``if mean < 0.05: continue`` did.
    shown_fields = [
        key
        for key in prog_percentage.fields
        if not ak.mean(prog_percentage[key], axis=0) < 0.05
    ]

    # Plot percent enrollment
    for key in shown_fields:
        plt.plot(prog_percentage[key] * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-percent.{ext}"))
    plt.clf()

    # Plot student enrollment
    for key in shown_fields:
        plt.plot(prog_percentage[key] * students, label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-students.{ext}"))
    plt.clf()
|
||||
|
||||
def save_scorecard_stats(output_dir, data, ext="png"):
    """Save two histograms of ``student.enrollment.undergrad_12_month``.

    Writes into ``output_dir``:

    * ``dist-students-enrolled-avg.<ext>`` -- per-record mean (``axis=1``).
    * ``dist-students-enrolled-last-year.<ext>`` -- last entry of each record.

    Both use 1,000-wide bins from 0 up to 100,000 and a logarithmic y-axis.

    Args:
        output_dir: Existing directory that receives the figures.
        data: Awkward Array of scorecard records.
        ext: File extension / image format passed on to ``savefig``.
    """
    enrollment = data.student.enrollment.undergrad_12_month
    bins = np.arange(0, 100_000, 1_000)

    histograms = (
        # (values, figure title, output file stem)
        (
            ak.mean(enrollment, axis=1),
            "Enrollment Distribution (Average)",
            "dist-students-enrolled-avg",
        ),
        (
            enrollment[:, -1],
            "Enrollment Distribution (Last Year)",
            "dist-students-enrolled-last-year",
        ),
    )

    for values, title, stem in histograms:
        plt.hist(values, bins=bins)
        plt.xlabel("Students Enrolled")
        plt.ylabel("University Count")
        plt.title(title)
        plt.yscale("log")
        plt.savefig(os.path.join(output_dir, f"{stem}.{ext}"))
        plt.clf()
|
||||
|
||||
|
||||
|
||||
# Print national enrollment stats
def print_enrollment_national(data):
    """Print per-field enrollment statistics over all records.

    Builds one table row per field of ``academics.program_percentage``:

    * ``Percent (Last)``  -- mean of every record's last share entry, x100.
    * ``Percent (Avg)``   -- mean of all share entries, x100.
    * ``Students (Last)`` -- mean of last share times last enrollment count.
    * ``Students (Avg)``  -- mean of share times enrollment over all entries.

    The table is printed twice, sorted descending by ``Percent (Last)`` and
    then by ``Percent (Avg)``; each printout is announced through the module
    logger.

    Args:
        data: Awkward Array of scorecard records (two-dimensional series:
            records along axis 0, entries per record along axis 1).
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month
    # Last entry of EVERY record. A plain ``[-1]`` would select the last
    # record instead; the rest of this module indexes the latest entry with
    # ``[:, -1]`` as well.
    students_last = students[:, -1]

    # Create dataframe of enrollment across the nation (latest academic year and average)
    table = {'Name': list(prog_percentage.fields), 'Percent (Last)': [], 'Percent (Avg)': [], "Students (Last)": [], "Students (Avg)": []}
    for key in table['Name']:
        share = prog_percentage[key]
        share_last = share[:, -1]
        table['Percent (Last)'].append(ak.mean(share_last) * 100)
        table['Percent (Avg)'].append(ak.mean(share) * 100)
        # Enrollment counts can be missing (NaN); skip those entries rather
        # than letting a single NaN turn the whole mean into NaN.
        table['Students (Last)'].append(ak.nanmean(share_last * students_last))
        table['Students (Avg)'].append(ak.nanmean(share * students))

    df = pd.DataFrame(table)

    # Print Dataframe
    logger.info("\n\nNational Percentage (Last Academic Year): ")
    print(df.sort_values("Percent (Last)", ascending=False))

    logger.info("\n\nNational Percentage (Avg): ")
    print(df.sort_values("Percent (Avg)", ascending=False))
|
||||
|
||||
def print_enrollment(data, name: str):
    """Print per-field enrollment statistics for a single school.

    The school's record is looked up with ``mask_school``. For every field of
    its ``academics.program_percentage`` the table holds the last share entry
    and the mean share (both x100), plus the same two figures multiplied by
    ``student.enrollment.undergrad_12_month``. The table is printed twice,
    sorted descending by ``Percent (Last)`` and then by ``Percent (Avg)``;
    each printout is announced through the module logger.

    Args:
        data: Awkward Array of scorecard records.
        name: Exact school name, as expected by ``mask_school``.
    """
    record = mask_school(data, name)
    shares = record.academics.program_percentage
    head_count = record.student.enrollment.undergrad_12_month

    # One list per column, all derived from the same ordered field names.
    field_names = list(shares.fields)
    table = pd.DataFrame({
        'Name': field_names,
        'Percent (Last)': [shares[field][-1] * 100 for field in field_names],
        'Percent (Avg)': [ak.mean(shares[field]) * 100 for field in field_names],
        "Students (Last)": [shares[field][-1] * head_count[-1] for field in field_names],
        "Students (Avg)": [ak.mean(shares[field] * head_count) for field in field_names],
    })

    # Print Dataframe
    for heading, column in (
        (f"\n\nProgram Percentage ({name}, Last Academic Year): ", "Percent (Last)"),
        (f"\n\nProgram Percentage ({name}, Avg): ", "Percent (Avg)"),
    ):
        logger.info(heading)
        print(table.sort_values(column, ascending=False))
|
||||
61
utils.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
|
||||
logger = logging.getLogger("utils")
|
||||
|
||||
# Common Args
def get_common_args(prog: str):
    """Build the argument parser shared by the project's scripts.

    Options defined here:

    * ``-D`` / ``--data-dir``: directory for downloaded data. Defaults to the
      ``DATA_DIRECTORY`` environment variable, or ``"data"`` when unset.
    * ``-d`` / ``--debug``: flag that enables verbose logging.

    Args:
        prog: Program name shown in usage/help output.

    Returns:
        The configured ``argparse.ArgumentParser``; callers may add their own
        options before parsing.
    """
    fallback_data_dir = os.environ.get("DATA_DIRECTORY", "data")
    common = argparse.ArgumentParser(prog=prog)

    # Analysis settings
    common.add_argument(
        "-D", "--data-dir",
        default=fallback_data_dir,
        help="Directory for downloaded data",
    )

    # Debug
    common.add_argument(
        "-d", "--debug",
        action="store_true",
        default=False,
        help="Enable verbose logging",
    )

    return common
|
||||
|
||||
|
||||
# LOGGING
class Formatter(logging.Formatter):
    """Log formatter that colours each record according to its level.

    DEBUG is grey, WARNING yellow, ERROR red and CRITICAL bold red; INFO is
    left uncoloured. Records with any other level number (custom levels) use
    the uncoloured INFO layout instead of failing.
    """

    # ANSI escape sequences
    grey = "\x1b[37m"
    yellow = "\x1b[33m"
    red = "\x1b[31m"
    bold_red = "\x1b[1;31m"
    reset = "\x1b[0m"
    # Deliberately not called ``format``: that name belongs to the method
    # below, which would replace a same-named class attribute.
    LOG_FORMAT = "%(asctime)s - %(name)-24s - %(levelname)-7s - %(message)s (%(filename)s:%(lineno)d)"

    # One pre-built formatter per standard level.
    FORMATS = {
        logging.DEBUG: logging.Formatter(grey + LOG_FORMAT + reset),
        logging.INFO: logging.Formatter(LOG_FORMAT),
        logging.WARNING: logging.Formatter(yellow + LOG_FORMAT + reset),
        logging.ERROR: logging.Formatter(red + LOG_FORMAT + reset),
        logging.CRITICAL: logging.Formatter(bold_red + LOG_FORMAT + reset),
    }

    def format(self, record):
        """Format ``record`` with the formatter registered for its level.

        Args:
            record: The ``logging.LogRecord`` to render.

        Returns:
            The formatted log line.
        """
        # Unknown level numbers fall back to the plain INFO layout; a bare
        # ``.get()`` would return None and fail on ``.format``.
        formatter = self.FORMATS.get(record.levelno, self.FORMATS[logging.INFO])
        return formatter.format(record)
|
||||
|
||||
|
||||
def setup_logging(debug: bool = False):
    """Configure root logging with a stream handler using ``Formatter``.

    Args:
        debug: When true the root level is ``logging.DEBUG``; otherwise
            ``logging.INFO``.
    """
    root_level = logging.DEBUG if debug else logging.INFO

    console = logging.StreamHandler()
    console.setFormatter(Formatter())

    logging.basicConfig(level=root_level, handlers=[console])
|
||||