Files
undergrad-uh401/explore.ipynb
2025-11-08 05:42:28 +00:00

281 KiB

In [1]:
!pip install -r requirements.txt
Requirement already satisfied: awkward in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 1)) (2.8.10)
Requirement already satisfied: pandas in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.3.0)
Requirement already satisfied: numpy in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 3)) (2.2.6)
Requirement already satisfied: tqdm in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 5)) (4.67.1)
Requirement already satisfied: awkward-cpp==50 in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (50)
Requirement already satisfied: fsspec>=2022.11.0 in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (2025.5.1)
Requirement already satisfied: packaging in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (25.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->-r requirements.txt (line 2)) (1.17.0)
In [30]:
import os

import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
import pandas as pd
In [3]:
scorecard_dir = "data/scorecard"
scorecard_dir = os.path.join(scorecard_dir, os.listdir(scorecard_dir)[0])
data = ak.from_parquet(os.path.join(scorecard_dir, "merged.parquet"))
In [118]:
mask_nan = ~ak.any(np.isnan(data.academics.program_percentage.physical_science), axis=1)
mask_all_zero = ~ak.all(data.academics.program_percentage.physical_science == 0, axis=1)
mask = mask_nan & mask_all_zero
data_m = data[mask]

data_ua = data[data.school.name[:, -1] == "The University of Alabama"][0]

National Stats

In [119]:
for key in data_m.academics.program_percentage.fields:
    mean = ak.mean(data_m.academics.program_percentage[key], axis=0)
    if ak.all(mean < 0.03):
        continue
    plt.plot(mean * 100, label=key)
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.title("Average Enrollment (National)")
plt.legend()
plt.show()
No description has been provided for this image
In [139]:
# Mean, nationwide, across all years
vals = {}
percents = data_m.academics.program_percentage
for key in percents.fields:
    vals[key] = ak.mean(percents[key]) * 100
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values() })
df.sort_values("Percent", ascending=False)
Out[139]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Name Percent
36 business_marketing 17.399856
35 health 12.027962
15 humanities 9.546438
8 education 6.705871
29 social_science 6.253486
26 psychology 5.195744
17 biological 4.908001
34 visual_performing 3.903754
4 communication 3.192944
27 security_law_enforcement 3.019295
6 computer 2.808726
9 engineering 2.803155
14 english 2.671756
20 multidiscipline 1.964365
21 parks_recreation_fitness 1.885363
37 history 1.670524
28 public_administration_social_service 1.601816
24 physical_science 1.598245
10 engineering_technology 1.377941
12 family_consumer_science 1.222658
18 mathematics 1.048517
11 language 0.951896
0 agriculture 0.819862
31 mechanic_repair_technology 0.811160
1 resources 0.700875
22 philosophy_religious 0.677424
7 personal_culinary 0.444786
23 theology_religious_vocation 0.432991
32 precision_production 0.379921
3 ethnic_cultural_gender 0.368425
13 legal 0.366860
33 transportation 0.331288
30 construction 0.317400
2 architecture 0.243856
5 communications_technology 0.215724
25 science_technology 0.080646
19 military 0.035292
16 library 0.012911
In [140]:
# Mean, nationwide, across latest year
vals = {}
percents = data_m.academics.program_percentage
for key in percents.fields:
    vals[key] = ak.mean(percents[key, -1]) * 100
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values() })
df.sort_values("Percent", ascending=False)
Out[140]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Name Percent
35 health 22.149643
36 business_marketing 14.511786
15 humanities 10.181786
8 education 7.426786
6 computer 7.148571
31 mechanic_repair_technology 6.220357
27 security_law_enforcement 4.961071
10 engineering_technology 3.409286
20 multidiscipline 2.957857
12 family_consumer_science 2.880357
32 precision_production 2.701429
17 biological 2.671786
29 social_science 2.312143
28 public_administration_social_service 1.677500
26 psychology 1.285714
4 communication 1.136786
9 engineering 0.972143
34 visual_performing 0.807500
7 personal_culinary 0.698571
11 language 0.633929
18 mathematics 0.611071
21 parks_recreation_fitness 0.593214
30 construction 0.573571
13 legal 0.529286
24 physical_science 0.421429
14 english 0.237500
37 history 0.208929
22 philosophy_religious 0.072500
3 ethnic_cultural_gender 0.007500
1 resources 0.000000
5 communications_technology 0.000000
2 architecture 0.000000
0 agriculture 0.000000
16 library 0.000000
19 military 0.000000
25 science_technology 0.000000
23 theology_religious_vocation 0.000000
33 transportation 0.000000

Enrollment, UA

In [141]:
plt.clf()
for key in data_ua.academics.program_percentage.fields:
    vals = data_ua.academics.program_percentage[key]
    if ak.sum(vals) == 0:
        continue
    if ak.all(vals < 0.05):
        continue

    plt.plot(vals * 100, label=key)
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.title("Average Enrollment (UA)")
plt.legend()
plt.show()


vals = {}
data_m = data_ua.academics.program_percentage
for key in data_m.fields:
    vals[key] = data_m[key][-1] * 100
    
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values() })
display(df.sort_values("Percent", ascending=False))
display(sum(df['Percent']))
No description has been provided for this image
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Name Percent
36 business_marketing 30.11
9 engineering 9.66
35 health 9.66
4 communication 9.42
29 social_science 7.99
12 family_consumer_science 6.96
26 psychology 5.13
17 biological 3.89
21 parks_recreation_fitness 2.86
8 education 2.72
34 visual_performing 2.17
20 multidiscipline 1.72
6 computer 1.44
37 history 1.17
28 public_administration_social_service 1.09
24 physical_science 1.06
14 english 0.97
18 mathematics 0.89
11 language 0.45
1 resources 0.38
22 philosophy_religious 0.18
3 ethnic_cultural_gender 0.09
5 communications_technology 0.00
2 architecture 0.00
0 agriculture 0.00
16 library 0.00
10 engineering_technology 0.00
7 personal_culinary 0.00
15 humanities 0.00
13 legal 0.00
19 military 0.00
27 security_law_enforcement 0.00
23 theology_religious_vocation 0.00
25 science_technology 0.00
33 transportation 0.00
32 precision_production 0.00
31 mechanic_repair_technology 0.00
30 construction 0.00
100.00999999999999

Misc. Stats (Fun)

In [142]:
avg_students = ak.mean(data.student.enrollment.undergrad_12_month, axis=1)
plt.hist(avg_students, bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Average)")
plt.yscale("log")
No description has been provided for this image
In [143]:
plt.hist(data.student.enrollment.undergrad_12_month[:, -1], bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Last Year)")
plt.yscale("log")
No description has been provided for this image
In [144]:
plt.hist(data.admissions.admission_rate.overall[:, -1])
Out[144]:
(array([ 35.,  48.,  40.,  75., 118., 158., 227., 334., 389., 522.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)
No description has been provided for this image
In [145]:
data.school.fields
Out[145]:
['name',
 'city',
 'state',
 'zip',
 'accreditor',
 'school_url',
 'price_calculator_url',
 'degrees_awarded',
 'under_investigation',
 'main_campus',
 'branches',
 'ownership',
 'state_fips',
 'region_id',
 'locale',
 'degree_urbanization',
 'carnegie_basic',
 'carnegie_undergrad',
 'carnegie_size_setting',
 'minority_serving',
 'men_only',
 'women_only',
 'religious_affiliation',
 'online_only',
 'operating',
 'tuition_revenue_per_fte',
 'instructional_expenditure_per_fte',
 'faculty_salary',
 'ft_faculty_rate',
 'alias',
 'institutional_characteristics',
 'open_admissions_policy',
 'accreditor_code',
 'title_iv',
 'ownership_peps',
 'endowment',
 'dolflag',
 'peps_ownership',
 'address',
 'sector']