Files
undergrad-uh401/scorecard_tools.py
2025-12-06 03:59:17 +00:00

162 lines
5.9 KiB
Python

import os
import logging
import awkward as ak
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Module-level logger; the print_* functions in this file emit their section
# headers through it (the tables themselves are written with print()).
logger = logging.getLogger("Scorecard Data")
def mask_valid(data):
    """Drop entries whose physical-science share is only placeholder data.

    An entry is removed when its
    ``academics.program_percentage.physical_science`` values contain at
    least one NaN along axis 1, or are all equal to zero along axis 1.
    Only this one field is inspected; it serves as the validity indicator
    for the whole entry.

    Args:
        data: Array exposing ``academics.program_percentage.physical_science``
            with at least two axes (presumably school x year -- confirm
            against the loader).

    Returns:
        ``data`` restricted to the entries that pass both checks.
    """
    phys_sci = data.academics.program_percentage.physical_science
    has_nan = ak.any(np.isnan(phys_sci), axis=1)
    only_zeros = ak.all(phys_sci == 0, axis=1)
    # Keep an entry only if it is neither NaN-tainted nor entirely zero.
    return data[~(has_nan | only_zeros)]
def mask_school(data, name):
    """Return the first entry of ``data`` whose school name equals ``name``.

    The comparison uses the last element along axis 1 of
    ``data.school.name`` (the same position the rest of this file treats as
    the most recent year). If several entries match, the first one is
    returned.

    Args:
        data: Array exposing a two-axis ``school.name`` field.
        name: School name to look up; compared with ``==`` (exact match).

    Returns:
        The single matching entry (one element of ``data``).

    Raises:
        IndexError: If no entry matches ``name``.
    """
    matches = data[data.school.name[:, -1] == name]
    if len(matches) == 0:
        # Indexing an empty selection would also raise IndexError, but with
        # a message that does not say which school was being looked up.
        raise IndexError(f"no school named {name!r} found in data")
    return matches[0]
# Percentage enrollment per field over time averaged across the nation
def save_scorecard_stats_national(output_dir, data, ext="png"):
    """Plot nationally averaged enrollment per program and save two figures.

    Figures written into ``output_dir``:

    * ``national-enrollment-percent.<ext>`` -- per-program mean of
      ``academics.program_percentage`` along axis 0, times 100.
    * ``national-enrollment-students.<ext>`` -- per-program mean along
      axis 0 of program share multiplied by
      ``student.enrollment.undergrad_12_month``.

    A program is left out of both figures when its axis-0 mean share is
    below 0.05 at every position.

    Args:
        output_dir: Directory the figures are saved into (not created here).
        data: Array exposing ``academics.program_percentage`` (one field per
            program) and ``student.enrollment.undergrad_12_month``;
            presumably laid out as school x year -- confirm with the loader.
        ext: File extension passed to ``savefig`` (default ``"png"``).
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month

    # Apply the 5% filter once so both figures show the same programs.
    mean_share = {}
    for key in prog_percentage.fields:
        # Find the mean across the nation
        mean = ak.mean(prog_percentage[key], axis=0)
        if ak.all(mean < 0.05):
            continue
        mean_share[key] = mean

    # Percent enrollment
    for key, mean in mean_share.items():
        plt.plot(mean * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-percent.{ext}"))
    plt.clf()

    # Students enrolled
    for key in mean_share:
        plt.plot(ak.mean(prog_percentage[key] * students, axis=0), label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title("Average Enrollment (National)")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"national-enrollment-students.{ext}"))
    plt.clf()
# Percentage enrollment per field over time for one school
def save_scorecard_stats_school(output_dir, data, school: str, ext="png"):
    """Plot per-program enrollment for a single school and save two figures.

    The school's entry is selected with ``mask_school``. Figures written
    into ``output_dir``:

    * ``<school>-enrollment-percent.<ext>`` -- each program's share times 100.
    * ``<school>-enrollment-students.<ext>`` -- each program's share
      multiplied by ``student.enrollment.undergrad_12_month``.

    A program is left out of both figures when the mean of its share over
    the school's series is below 0.05.

    Args:
        output_dir: Directory the figures are saved into (not created here).
        data: Full dataset; must be accepted by ``mask_school``.
        school: School name; also used verbatim in titles and file names.
        ext: File extension passed to ``savefig`` (default ``"png"``).
    """
    data = mask_school(data, school)
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month

    # Apply the 5% filter once so both figures show the same programs.
    shown = []
    for key in prog_percentage.fields:
        # Mean of this school's share for the program over its whole series
        mean = ak.mean(prog_percentage[key], axis=0)
        if mean < 0.05:
            continue
        shown.append(key)

    # Plot percent enrollment
    for key in shown:
        plt.plot(prog_percentage[key] * 100, label=key)
    plt.xlabel("Year")
    plt.ylabel("Percent Enrollment")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-percent.{ext}"))
    plt.clf()

    # Plot student enrollment
    for key in shown:
        plt.plot(prog_percentage[key] * students, label=key)
    plt.xlabel("Year")
    # This figure shows head counts, not percentages.
    plt.ylabel("Students Enrolled")
    plt.title(f"Average Enrollment ({school})")
    plt.legend()
    plt.savefig(os.path.join(output_dir, f"{school}-enrollment-students.{ext}"))
    plt.clf()
def save_scorecard_stats(output_dir, data, ext="png"):
    """Save two log-scale histograms of undergraduate enrollment.

    Figures written into ``output_dir``:

    * ``dist-students-enrolled-avg.<ext>`` -- mean of
      ``student.enrollment.undergrad_12_month`` along axis 1.
    * ``dist-students-enrolled-last-year.<ext>`` -- the last element along
      axis 1 of the same field.

    Args:
        output_dir: Directory the figures are saved into (not created here).
        data: Array exposing ``student.enrollment.undergrad_12_month`` with
            two axes (presumably school x year -- confirm with the loader).
        ext: File extension passed to ``savefig`` (default ``"png"``).
    """
    enrollment = data.student.enrollment.undergrad_12_month
    # NOTE: the last bin edge is 99_000, so larger values are not counted.
    bins = np.arange(0, 100_000, 1_000)

    figures = (
        ("Average", "avg", ak.mean(enrollment, axis=1)),
        ("Last Year", "last-year", enrollment[:, -1]),
    )
    for label, suffix, values in figures:
        plt.hist(values, bins=bins)
        plt.xlabel("Students Enrolled")
        plt.ylabel("University Count")
        plt.title(f"Enrollment Distribution ({label})")
        plt.yscale("log")
        plt.savefig(os.path.join(output_dir, f"dist-students-enrolled-{suffix}.{ext}"))
        plt.clf()
# Print national enrollment stats
def print_enrollment_national(data):
    """Print per-program enrollment statistics averaged over all entries.

    Builds one table row per field of ``academics.program_percentage``:

    * ``Percent (Last)`` -- mean share at the last position of axis 1, x100.
    * ``Percent (Avg)`` -- mean share over every element, x100.
    * ``Students (Last)`` -- mean of share x enrollment at the last position
      of axis 1.
    * ``Students (Avg)`` -- mean of share x enrollment over every element.

    The table is printed twice, sorted descending by ``Percent (Last)`` and
    then by ``Percent (Avg)``; each print is preceded by a header sent to
    the module logger.

    Args:
        data: Array exposing ``academics.program_percentage`` and
            ``student.enrollment.undergrad_12_month`` with two axes
            (presumably school x year -- confirm with the loader).
    """
    prog_percentage = data.academics.program_percentage
    students = data.student.enrollment.undergrad_12_month
    # The "last" columns need the final position of axis 1 for every entry.
    # A plain [-1] would pick the last entry along axis 0 instead.
    students_last = students[:, -1]

    # Create dataframe of enrollment across the nation (latest academic year and average)
    table = {
        'Name': list(prog_percentage.fields),
        'Percent (Last)': [],
        'Percent (Avg)': [],
        "Students (Last)": [],
        "Students (Avg)": [],
    }
    for key in table['Name']:
        vals = prog_percentage[key]
        vals_last = vals[:, -1]
        table['Percent (Last)'].append(ak.mean(vals_last) * 100)
        table['Percent (Avg)'].append(ak.mean(vals) * 100)
        table['Students (Last)'].append(ak.mean(vals_last * students_last))
        table['Students (Avg)'].append(ak.mean(vals * students))
    df = pd.DataFrame(table)

    # Print Dataframe
    logger.info("\n\nNational Percentage (Last Academic Year): ")
    print(df.sort_values("Percent (Last)", ascending=False))
    logger.info("\n\nNational Percentage (Avg): ")
    print(df.sort_values("Percent (Avg)", ascending=False))
def print_enrollment(data, name: str):
    """Print per-program enrollment statistics for one school.

    The school's entry is selected with ``mask_school``. One table row is
    built per field of ``academics.program_percentage``:

    * ``Percent (Last)`` -- last value of the program's share, x100.
    * ``Percent (Avg)`` -- mean of the program's share, x100.
    * ``Students (Last)`` -- last share x last ``undergrad_12_month`` value.
    * ``Students (Avg)`` -- mean of share x ``undergrad_12_month``.

    The table is printed twice, sorted descending by ``Percent (Last)`` and
    then by ``Percent (Avg)``; each print is preceded by a header sent to
    the module logger.

    Args:
        data: Full dataset; must be accepted by ``mask_school``.
        name: School name to report on; also used in the logged headers.
    """
    school = mask_school(data, name)
    shares = school.academics.program_percentage
    enrolled = school.student.enrollment.undergrad_12_month

    # One row per program for this school (last entry of its series and average)
    programs = list(shares.fields)
    table = pd.DataFrame(
        {
            'Name': programs,
            'Percent (Last)': [shares[p][-1] * 100 for p in programs],
            'Percent (Avg)': [ak.mean(shares[p]) * 100 for p in programs],
            "Students (Last)": [shares[p][-1] * enrolled[-1] for p in programs],
            "Students (Avg)": [ak.mean(shares[p] * enrolled) for p in programs],
        }
    )

    # Headers go through the logger, the tables go to stdout.
    logger.info(f"\n\nProgram Percentage ({name}, Last Academic Year): ")
    print(table.sort_values("Percent (Last)", ascending=False))
    logger.info(f"\n\nProgram Percentage ({name}, Avg): ")
    print(table.sort_values("Percent (Avg)", ascending=False))