281 KiB
281 KiB
In [1]:
!pip install -r requirements.txt
In [30]:
import os
import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
import pandas as pd
In [3]:
# Locate the scorecard sub-directory and load the merged parquet file from it.
scorecard_root = "data/scorecard"

# os.listdir returns entries in arbitrary order and also lists plain files
# (e.g. ".DS_Store"), so keep directories only and sort them to make the
# choice of "first" entry deterministic across machines and re-runs.
scorecard_subdirs = sorted(
    name
    for name in os.listdir(scorecard_root)
    if os.path.isdir(os.path.join(scorecard_root, name))
)
if not scorecard_subdirs:
    raise FileNotFoundError(f"No sub-directory found under {scorecard_root!r}")

scorecard_dir = os.path.join(scorecard_root, scorecard_subdirs[0])
data = ak.from_parquet(os.path.join(scorecard_dir, "merged.parquet"))
In [118]:
# Build a per-entry filter from the physical-science share series:
# drop entries whose series contains any NaN, and entries whose series is all zeros.
physical_science = data.academics.program_percentage.physical_science
contains_nan = ak.any(np.isnan(physical_science), axis=1)
is_all_zero = ak.all(physical_science == 0, axis=1)

mask_nan = ~contains_nan
mask_all_zero = ~is_all_zero
mask = mask_nan & mask_all_zero
data_m = data[mask]

# Pick the first entry whose last recorded school name matches UA exactly.
is_ua = data.school.name[:, -1] == "The University of Alabama"
data_ua = data[is_ua][0]
National Stats¶
In [119]:
# One line per program: the mean share along axis 0 of the filtered data,
# converted from a fraction to a percentage.
national_shares = data_m.academics.program_percentage
for program in national_shares.fields:
    program_mean = ak.mean(national_shares[program], axis=0)
    # Programs whose mean stays below 3% at every position are not drawn.
    if not ak.all(program_mean < 0.03):
        plt.plot(program_mean * 100, label=program)

plt.title("Average Enrollment (National)")
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.legend()
plt.show()
In [139]:
# Overall mean share per program (every value of the filtered data pooled
# together), expressed as a percentage and listed from largest to smallest.
percents = data_m.academics.program_percentage
vals = {name: ak.mean(percents[name]) * 100 for name in percents.fields}
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values()})
df.sort_values(by="Percent", ascending=False)
Out[139]:
In [140]:
# Mean, nationwide, across latest year
#
# NOTE: `percents[key, -1]` is equivalent to `percents[key][-1]`, i.e. it picks
# the LAST SCHOOL's whole series rather than the last year of every school.
# Slice the second axis explicitly (`[:, -1]`), matching how the rest of the
# notebook selects the most recent year.
percents = data_m.academics.program_percentage
vals = {}
for key in percents.fields:
    latest_year = percents[key][:, -1]  # one value per school
    vals[key] = ak.mean(latest_year) * 100
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values()})
df.sort_values("Percent", ascending=False)
Out[140]:
Enrollment, UA¶
In [141]:
# Plot UA's per-program share over time, then tabulate the most recent values.
#
# The UA record is bound to its own name (`ua_percents`) instead of rebinding
# `data_m`, which holds the filtered national dataset used by earlier cells;
# overwriting it would break those cells when re-run after this one.
plt.clf()
ua_percents = data_ua.academics.program_percentage
for key in ua_percents.fields:
    series = ua_percents[key]
    # Skip programs with no recorded share at all.
    if ak.sum(series) == 0:
        continue
    # Skip programs that never reach 5%.
    if ak.all(series < 0.05):
        continue
    plt.plot(series * 100, label=key)
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.title("Average Enrollment (UA)")
plt.legend()
plt.show()

# Last entry of each program's series, as a percentage.
vals = {key: ua_percents[key][-1] * 100 for key in ua_percents.fields}
df = pd.DataFrame({'Name': vals.keys(), 'Percent': vals.values()})
display(df.sort_values("Percent", ascending=False))
# Total of all program percentages, shown as a sanity check.
display(sum(df['Percent']))
Misc. Stats (Fun)¶
In [142]:
# Histogram of each entry's mean 12-month undergraduate enrollment
# (mean taken along axis 1), in 1,000-student bins up to 100,000.
avg_students = ak.mean(data.student.enrollment.undergrad_12_month, axis=1)
plt.hist(avg_students, bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("University Count")  # fixed typo: "Univeristy"
plt.title("Enrollment Distribution (Average)")  # fixed typo: "Enrollemnt"
plt.yscale("log")  # log scale keeps the sparse high-enrollment bins visible
plt.show()
In [143]:
# Histogram of the last recorded 12-month undergraduate enrollment per entry,
# in 1,000-student bins up to 100,000.
last_year_students = data.student.enrollment.undergrad_12_month[:, -1]
plt.hist(last_year_students, bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("University Count")  # fixed typo: "Univeristy"
plt.title("Enrollment Distribution (Last Year)")  # fixed typo: "Enrollemnt"
plt.yscale("log")  # log scale keeps the sparse high-enrollment bins visible
plt.show()
In [144]:
# Histogram of the last recorded overall admission rate per entry.
# Labels and a title are added so the figure stands on its own, and
# plt.show() keeps the (counts, bins, patches) tuple out of the cell output.
plt.hist(data.admissions.admission_rate.overall[:, -1])
plt.xlabel("Admission Rate")
plt.ylabel("University Count")
plt.title("Admission Rate Distribution (Last Year)")
plt.show()
Out[144]:
In [145]:
# Show the field names available under the `school` record, for exploration.
data.school.fields
Out[145]: