undergrad-uh401/explore-basic.ipynb at a6e6a51041e9883c8aa2e068d1e4269528ea1423

In [1]:

!pip install -r requirements.txt

Collecting awkward (from -r requirements.txt (line 1))
  Downloading awkward-2.8.10-py3-none-any.whl.metadata (7.5 kB)
Requirement already satisfied: pandas in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.3.0)
Requirement already satisfied: numpy in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 3)) (2.2.6)
Requirement already satisfied: tqdm in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 5)) (4.67.1)
Collecting awkward-cpp==50 (from awkward->-r requirements.txt (line 1))
  Using cached awkward_cpp-50-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (2.2 kB)
Requirement already satisfied: fsspec>=2022.11.0 in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (2025.5.1)
Requirement already satisfied: packaging in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (25.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->-r requirements.txt (line 2)) (1.17.0)
Downloading awkward-2.8.10-py3-none-any.whl (907 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 908.0/908.0 kB 6.0 MB/s eta 0:00:00
Using cached awkward_cpp-50-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (655 kB)
Installing collected packages: awkward-cpp, awkward
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2/2 [awkward]m1/2 [awkward]
Successfully installed awkward-2.8.10 awkward-cpp-50

In [14]:

import os

import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
import pandas as pd

In [29]:

df = pd.read_excel("data/ua.xlsx", header=3)
df = df[:-4][[field for field in df[:-4] if 'Unnamed' not in field]]
key_all = "All R&D expenditures"
key_federal = "Federal government"
key_state = "State and local government"
key_inst = "Institution funds"
key_business = "Business"
key_nonprofit = "Nonprofit organizations"
key_other = "All other sources"
keys = [key_federal, key_state, key_inst, key_business, key_nonprofit, key_other]
for key in keys:
    key_percent = key + " Percent"
    df[key_percent] = df[key] / df[key_all] * 100
df.sort_values(key_all)
df

/opt/conda/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

Out[29]:

	Field	All R&D expenditures	Federal government	State and local government	Institution funds	Business	Nonprofit organizations	All other sources	Federal government Percent	State and local government Percent	Institution funds Percent	Business Percent	Nonprofit organizations Percent	All other sources Percent
0	All R&D fields	184880.0	86244.0	16660.0	77554.0	1973.0	1753.0	696.0	46.648637	9.011251	41.948291	1.067179	0.948183	0.376460
1	Science	92834.0	44257.0	12309.0	34954.0	570.0	570.0	174.0	47.673266	13.259151	37.652153	0.613999	0.613999	0.187431
2	Computer and information sciences	18813.0	5584.0	9591.0	3595.0	0.0	3.0	40.0	29.681603	50.980705	19.109127	0.000000	0.015946	0.212619
3	Geosciences, atmospheric sciences, and ocean s...	16822.0	12947.0	1327.0	2324.0	106.0	102.0	16.0	76.964689	7.888479	13.815242	0.630127	0.606349	0.095114
4	Atmospheric science and meteorology	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
5	Geological and earth sciences	14248.0	12945.0	21.0	1058.0	106.0	102.0	16.0	90.854857	0.147389	7.425604	0.743964	0.715890	0.112296
6	Ocean sciences and marine sciences	2574.0	2.0	1306.0	1266.0	0.0	0.0	0.0	0.077700	50.738151	49.184149	0.000000	0.000000	0.000000
7	Geosciences, atmospheric sciences, and ocean s...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
8	Life sciences	23853.0	11858.0	0.0	11487.0	141.0	292.0	75.0	49.712824	0.000000	48.157464	0.591121	1.224165	0.314426
9	Agricultural sciences	11.0	7.0	0.0	4.0	0.0	0.0	0.0	63.636364	0.000000	36.363636	0.000000	0.000000	0.000000
10	Biological and biomedical sciences	10055.0	4436.0	0.0	5540.0	1.0	62.0	16.0	44.117355	0.000000	55.096967	0.009945	0.616609	0.159125
11	Health sciences	13736.0	7366.0	0.0	5942.0	140.0	230.0	58.0	53.625510	0.000000	43.258591	1.019220	1.674432	0.422248
12	Natural resources and conservation	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
13	Life sciences nec	51.0	49.0	0.0	1.0	0.0	0.0	1.0	96.078431	0.000000	1.960784	0.000000	0.000000	1.960784
14	Mathematics and statistics	2918.0	453.0	1019.0	1418.0	0.0	28.0	0.0	15.524332	34.921179	48.594928	0.000000	0.959561	0.000000
15	Physical sciences	11124.0	7960.0	0.0	3148.0	4.0	12.0	0.0	71.556994	0.000000	28.299173	0.035958	0.107875	0.000000
16	Astronomy and astrophysics	271.0	271.0	0.0	0.0	0.0	0.0	0.0	100.000000	0.000000	0.000000	0.000000	0.000000	0.000000
17	Chemistry	7938.0	5109.0	0.0	2817.0	0.0	12.0	0.0	64.361300	0.000000	35.487528	0.000000	0.151172	0.000000
18	Materials science	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
19	Physics	2911.0	2580.0	0.0	327.0	4.0	0.0	0.0	88.629337	0.000000	11.233253	0.137410	0.000000	0.000000
20	Physical sciences nec	4.0	0.0	0.0	4.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
21	Psychology	5690.0	3001.0	13.0	2656.0	0.0	20.0	0.0	52.741652	0.228471	46.678383	0.000000	0.351494	0.000000
22	Social sciences	13350.0	2454.0	359.0	10062.0	319.0	113.0	43.0	18.382022	2.689139	75.370787	2.389513	0.846442	0.322097
23	Anthropology	792.0	270.0	0.0	465.0	0.0	14.0	43.0	34.090909	0.000000	58.712121	0.000000	1.767677	5.429293
24	Economics	3707.0	0.0	0.0	3707.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
25	Political science and government	898.0	1.0	0.0	883.0	0.0	14.0	0.0	0.111359	0.000000	98.329621	0.000000	1.559020	0.000000
26	Sociology, demography, and population studies	2908.0	84.0	183.0	2641.0	0.0	0.0	0.0	2.888583	6.292985	90.818432	0.000000	0.000000	0.000000
27	Social sciences nec	5045.0	2099.0	176.0	2366.0	319.0	85.0	0.0	41.605550	3.488603	46.897919	6.323092	1.684836	0.000000
28	Sciences nec	264.0	0.0	0.0	264.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
29	Engineering	51029.0	32570.0	3216.0	12977.0	1162.0	670.0	434.0	63.826452	6.302299	25.430637	2.277137	1.312979	0.850497
30	Aerospace, aeronautical, and astronautical eng...	4665.0	3291.0	50.0	1324.0	0.0	0.0	0.0	70.546624	1.071811	28.381565	0.000000	0.000000	0.000000
31	Bioengineering and biomedical engineering	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
32	Chemical engineering	6647.0	3946.0	20.0	2164.0	325.0	192.0	0.0	59.365127	0.300888	32.556040	4.889424	2.888521	0.000000
33	Civil engineering	15198.0	8770.0	2872.0	3068.0	96.0	119.0	273.0	57.704961	18.897223	20.186867	0.631662	0.782998	1.796289
34	Electrical, electronic, and communications eng...	10299.0	7380.0	109.0	1985.0	391.0	346.0	88.0	71.657442	1.058355	19.273716	3.796485	3.359549	0.854452
35	Industrial and manufacturing engineering	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
36	Mechanical engineering	7056.0	4641.0	144.0	2096.0	157.0	0.0	18.0	65.773810	2.040816	29.705215	2.225057	0.000000	0.255102
37	Metallurgical and materials engineering	6473.0	3896.0	21.0	2308.0	193.0	0.0	55.0	60.188475	0.324425	35.655801	2.981616	0.000000	0.849683
38	Engineering nec	691.0	646.0	0.0	32.0	0.0	13.0	0.0	93.487699	0.000000	4.630970	0.000000	1.881331	0.000000
39	Non-S&E	41017.0	9417.0	1135.0	29623.0	241.0	513.0	88.0	22.958773	2.767145	72.221274	0.587561	1.250701	0.214545
40	Business management and business administration	11919.0	1114.0	717.0	9954.0	123.0	11.0	0.0	9.346422	6.015605	83.513718	1.031966	0.092290	0.000000
41	Communication and communications technologies	2675.0	47.0	0.0	2612.0	0.0	14.0	2.0	1.757009	0.000000	97.644860	0.000000	0.523364	0.074766
42	Education	9731.0	4520.0	379.0	4688.0	2.0	142.0	0.0	46.449491	3.894769	48.175933	0.020553	1.459254	0.000000
43	Humanities	4813.0	18.0	0.0	4685.0	0.0	51.0	59.0	0.373987	0.000000	97.340536	0.000000	1.059630	1.225847
44	Law	2076.0	0.0	0.0	1876.0	0.0	200.0	0.0	0.000000	0.000000	90.366089	0.000000	9.633911	0.000000
45	Social work	4557.0	3565.0	20.0	914.0	0.0	45.0	13.0	78.231293	0.438885	20.057055	0.000000	0.987492	0.285275
46	Visual and performing arts	2803.0	0.0	17.0	2762.0	0.0	10.0	14.0	0.000000	0.606493	98.537281	0.000000	0.356761	0.499465
47	Non-S&E nec	2443.0	153.0	2.0	2132.0	116.0	40.0	0.0	6.262792	0.081867	87.269750	4.748260	1.637331	0.000000

In [32]:

df.sort_values(key_inst + " Percent", ascending=False)

Out[32]:

	Field	All R&D expenditures	Federal government	State and local government	Institution funds	Business	Nonprofit organizations	All other sources	Federal government Percent	State and local government Percent	Institution funds Percent	Business Percent	Nonprofit organizations Percent	All other sources Percent
24	Economics	3707.0	0.0	0.0	3707.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
20	Physical sciences nec	4.0	0.0	0.0	4.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
28	Sciences nec	264.0	0.0	0.0	264.0	0.0	0.0	0.0	0.000000	0.000000	100.000000	0.000000	0.000000	0.000000
46	Visual and performing arts	2803.0	0.0	17.0	2762.0	0.0	10.0	14.0	0.000000	0.606493	98.537281	0.000000	0.356761	0.499465
25	Political science and government	898.0	1.0	0.0	883.0	0.0	14.0	0.0	0.111359	0.000000	98.329621	0.000000	1.559020	0.000000
41	Communication and communications technologies	2675.0	47.0	0.0	2612.0	0.0	14.0	2.0	1.757009	0.000000	97.644860	0.000000	0.523364	0.074766
43	Humanities	4813.0	18.0	0.0	4685.0	0.0	51.0	59.0	0.373987	0.000000	97.340536	0.000000	1.059630	1.225847
26	Sociology, demography, and population studies	2908.0	84.0	183.0	2641.0	0.0	0.0	0.0	2.888583	6.292985	90.818432	0.000000	0.000000	0.000000
44	Law	2076.0	0.0	0.0	1876.0	0.0	200.0	0.0	0.000000	0.000000	90.366089	0.000000	9.633911	0.000000
47	Non-S&E nec	2443.0	153.0	2.0	2132.0	116.0	40.0	0.0	6.262792	0.081867	87.269750	4.748260	1.637331	0.000000
40	Business management and business administration	11919.0	1114.0	717.0	9954.0	123.0	11.0	0.0	9.346422	6.015605	83.513718	1.031966	0.092290	0.000000
22	Social sciences	13350.0	2454.0	359.0	10062.0	319.0	113.0	43.0	18.382022	2.689139	75.370787	2.389513	0.846442	0.322097
39	Non-S&E	41017.0	9417.0	1135.0	29623.0	241.0	513.0	88.0	22.958773	2.767145	72.221274	0.587561	1.250701	0.214545
23	Anthropology	792.0	270.0	0.0	465.0	0.0	14.0	43.0	34.090909	0.000000	58.712121	0.000000	1.767677	5.429293
10	Biological and biomedical sciences	10055.0	4436.0	0.0	5540.0	1.0	62.0	16.0	44.117355	0.000000	55.096967	0.009945	0.616609	0.159125
6	Ocean sciences and marine sciences	2574.0	2.0	1306.0	1266.0	0.0	0.0	0.0	0.077700	50.738151	49.184149	0.000000	0.000000	0.000000
14	Mathematics and statistics	2918.0	453.0	1019.0	1418.0	0.0	28.0	0.0	15.524332	34.921179	48.594928	0.000000	0.959561	0.000000
42	Education	9731.0	4520.0	379.0	4688.0	2.0	142.0	0.0	46.449491	3.894769	48.175933	0.020553	1.459254	0.000000
8	Life sciences	23853.0	11858.0	0.0	11487.0	141.0	292.0	75.0	49.712824	0.000000	48.157464	0.591121	1.224165	0.314426
27	Social sciences nec	5045.0	2099.0	176.0	2366.0	319.0	85.0	0.0	41.605550	3.488603	46.897919	6.323092	1.684836	0.000000
21	Psychology	5690.0	3001.0	13.0	2656.0	0.0	20.0	0.0	52.741652	0.228471	46.678383	0.000000	0.351494	0.000000
11	Health sciences	13736.0	7366.0	0.0	5942.0	140.0	230.0	58.0	53.625510	0.000000	43.258591	1.019220	1.674432	0.422248
0	All R&D fields	184880.0	86244.0	16660.0	77554.0	1973.0	1753.0	696.0	46.648637	9.011251	41.948291	1.067179	0.948183	0.376460
1	Science	92834.0	44257.0	12309.0	34954.0	570.0	570.0	174.0	47.673266	13.259151	37.652153	0.613999	0.613999	0.187431
9	Agricultural sciences	11.0	7.0	0.0	4.0	0.0	0.0	0.0	63.636364	0.000000	36.363636	0.000000	0.000000	0.000000
37	Metallurgical and materials engineering	6473.0	3896.0	21.0	2308.0	193.0	0.0	55.0	60.188475	0.324425	35.655801	2.981616	0.000000	0.849683
17	Chemistry	7938.0	5109.0	0.0	2817.0	0.0	12.0	0.0	64.361300	0.000000	35.487528	0.000000	0.151172	0.000000
32	Chemical engineering	6647.0	3946.0	20.0	2164.0	325.0	192.0	0.0	59.365127	0.300888	32.556040	4.889424	2.888521	0.000000
36	Mechanical engineering	7056.0	4641.0	144.0	2096.0	157.0	0.0	18.0	65.773810	2.040816	29.705215	2.225057	0.000000	0.255102
30	Aerospace, aeronautical, and astronautical eng...	4665.0	3291.0	50.0	1324.0	0.0	0.0	0.0	70.546624	1.071811	28.381565	0.000000	0.000000	0.000000
15	Physical sciences	11124.0	7960.0	0.0	3148.0	4.0	12.0	0.0	71.556994	0.000000	28.299173	0.035958	0.107875	0.000000
29	Engineering	51029.0	32570.0	3216.0	12977.0	1162.0	670.0	434.0	63.826452	6.302299	25.430637	2.277137	1.312979	0.850497
33	Civil engineering	15198.0	8770.0	2872.0	3068.0	96.0	119.0	273.0	57.704961	18.897223	20.186867	0.631662	0.782998	1.796289
45	Social work	4557.0	3565.0	20.0	914.0	0.0	45.0	13.0	78.231293	0.438885	20.057055	0.000000	0.987492	0.285275
34	Electrical, electronic, and communications eng...	10299.0	7380.0	109.0	1985.0	391.0	346.0	88.0	71.657442	1.058355	19.273716	3.796485	3.359549	0.854452
2	Computer and information sciences	18813.0	5584.0	9591.0	3595.0	0.0	3.0	40.0	29.681603	50.980705	19.109127	0.000000	0.015946	0.212619
3	Geosciences, atmospheric sciences, and ocean s...	16822.0	12947.0	1327.0	2324.0	106.0	102.0	16.0	76.964689	7.888479	13.815242	0.630127	0.606349	0.095114
19	Physics	2911.0	2580.0	0.0	327.0	4.0	0.0	0.0	88.629337	0.000000	11.233253	0.137410	0.000000	0.000000
5	Geological and earth sciences	14248.0	12945.0	21.0	1058.0	106.0	102.0	16.0	90.854857	0.147389	7.425604	0.743964	0.715890	0.112296
38	Engineering nec	691.0	646.0	0.0	32.0	0.0	13.0	0.0	93.487699	0.000000	4.630970	0.000000	1.881331	0.000000
13	Life sciences nec	51.0	49.0	0.0	1.0	0.0	0.0	1.0	96.078431	0.000000	1.960784	0.000000	0.000000	1.960784
16	Astronomy and astrophysics	271.0	271.0	0.0	0.0	0.0	0.0	0.0	100.000000	0.000000	0.000000	0.000000	0.000000	0.000000
4	Atmospheric science and meteorology	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
7	Geosciences, atmospheric sciences, and ocean s...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
12	Natural resources and conservation	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
18	Materials science	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
31	Bioengineering and biomedical engineering	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
35	Industrial and manufacturing engineering	0.0	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN

In [41]:

data_m = data[mask]
for key in data_m.academics.program_percentage.fields:
    mean = ak.mean(data_m.academics.program_percentage[key], axis=0)
    if ak.mean(mean) < 0.03:
        continue
    plt.plot(mean * 100, label=key)
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.legend()

Out[41]:

<matplotlib.legend.Legend at 0x7d90e330f4a0>

No description has been provided for this image

In [42]:

avg_students = ak.mean(data.student.enrollment.undergrad_12_month, axis=1)
plt.hist(avg_students, bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Average)")
plt.yscale("log")

In [43]:

plt.hist(data.student.enrollment.undergrad_12_month[:, -1], bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Last Year)")
plt.yscale("log")

In [44]:

plt.hist(data.admissions.admission_rate.overall[:, -1])

Out[44]:

(array([ 35.,  48.,  40.,  75., 118., 158., 227., 334., 389., 522.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)

In [ ]:

data.school.fields

243 KiB Raw Blame History

243 KiB

Raw Blame History