Files
undergrad-uh401/explore-basic.ipynb
2025-11-08 05:42:28 +00:00

243 KiB

In [1]:
!pip install -r requirements.txt
Collecting awkward (from -r requirements.txt (line 1))
  Downloading awkward-2.8.10-py3-none-any.whl.metadata (7.5 kB)
Requirement already satisfied: pandas in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.3.0)
Requirement already satisfied: numpy in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 3)) (2.2.6)
Requirement already satisfied: tqdm in /opt/conda/lib/python3.12/site-packages (from -r requirements.txt (line 5)) (4.67.1)
Collecting awkward-cpp==50 (from awkward->-r requirements.txt (line 1))
  Using cached awkward_cpp-50-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (2.2 kB)
Requirement already satisfied: fsspec>=2022.11.0 in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (2025.5.1)
Requirement already satisfied: packaging in /opt/conda/lib/python3.12/site-packages (from awkward->-r requirements.txt (line 1)) (25.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.12/site-packages (from pandas->-r requirements.txt (line 2)) (2025.2)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->-r requirements.txt (line 2)) (1.17.0)
Downloading awkward-2.8.10-py3-none-any.whl (907 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 908.0/908.0 kB 6.0 MB/s eta 0:00:00
Using cached awkward_cpp-50-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (655 kB)
Installing collected packages: awkward-cpp, awkward
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2/2 [awkward]m1/2 [awkward]
Successfully installed awkward-2.8.10 awkward-cpp-50
In [14]:
import os

import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
import pandas as pd
In [29]:
df = pd.read_excel("data/ua.xlsx", header=3)
df = df[:-4][[field for field in df[:-4] if 'Unnamed' not in field]]
key_all = "All R&D expenditures"
key_federal = "Federal government"
key_state = "State and local government"
key_inst = "Institution funds"
key_business = "Business"
key_nonprofit = "Nonprofit organizations"
key_other = "All other sources"
keys = [key_federal, key_state, key_inst, key_business, key_nonprofit, key_other]
for key in keys:
    key_percent = key + " Percent"
    df[key_percent] = df[key] / df[key_all] * 100
df.sort_values(key_all)
df
/opt/conda/lib/python3.12/site-packages/openpyxl/styles/stylesheet.py:237: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")
Out[29]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Field All R&D expenditures Federal government State and local government Institution funds Business Nonprofit organizations All other sources Federal government Percent State and local government Percent Institution funds Percent Business Percent Nonprofit organizations Percent All other sources Percent
0 All R&D fields 184880.0 86244.0 16660.0 77554.0 1973.0 1753.0 696.0 46.648637 9.011251 41.948291 1.067179 0.948183 0.376460
1 Science 92834.0 44257.0 12309.0 34954.0 570.0 570.0 174.0 47.673266 13.259151 37.652153 0.613999 0.613999 0.187431
2 Computer and information sciences 18813.0 5584.0 9591.0 3595.0 0.0 3.0 40.0 29.681603 50.980705 19.109127 0.000000 0.015946 0.212619
3 Geosciences, atmospheric sciences, and ocean s... 16822.0 12947.0 1327.0 2324.0 106.0 102.0 16.0 76.964689 7.888479 13.815242 0.630127 0.606349 0.095114
4 Atmospheric science and meteorology 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
5 Geological and earth sciences 14248.0 12945.0 21.0 1058.0 106.0 102.0 16.0 90.854857 0.147389 7.425604 0.743964 0.715890 0.112296
6 Ocean sciences and marine sciences 2574.0 2.0 1306.0 1266.0 0.0 0.0 0.0 0.077700 50.738151 49.184149 0.000000 0.000000 0.000000
7 Geosciences, atmospheric sciences, and ocean s... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
8 Life sciences 23853.0 11858.0 0.0 11487.0 141.0 292.0 75.0 49.712824 0.000000 48.157464 0.591121 1.224165 0.314426
9 Agricultural sciences 11.0 7.0 0.0 4.0 0.0 0.0 0.0 63.636364 0.000000 36.363636 0.000000 0.000000 0.000000
10 Biological and biomedical sciences 10055.0 4436.0 0.0 5540.0 1.0 62.0 16.0 44.117355 0.000000 55.096967 0.009945 0.616609 0.159125
11 Health sciences 13736.0 7366.0 0.0 5942.0 140.0 230.0 58.0 53.625510 0.000000 43.258591 1.019220 1.674432 0.422248
12 Natural resources and conservation 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
13 Life sciences nec 51.0 49.0 0.0 1.0 0.0 0.0 1.0 96.078431 0.000000 1.960784 0.000000 0.000000 1.960784
14 Mathematics and statistics 2918.0 453.0 1019.0 1418.0 0.0 28.0 0.0 15.524332 34.921179 48.594928 0.000000 0.959561 0.000000
15 Physical sciences 11124.0 7960.0 0.0 3148.0 4.0 12.0 0.0 71.556994 0.000000 28.299173 0.035958 0.107875 0.000000
16 Astronomy and astrophysics 271.0 271.0 0.0 0.0 0.0 0.0 0.0 100.000000 0.000000 0.000000 0.000000 0.000000 0.000000
17 Chemistry 7938.0 5109.0 0.0 2817.0 0.0 12.0 0.0 64.361300 0.000000 35.487528 0.000000 0.151172 0.000000
18 Materials science 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
19 Physics 2911.0 2580.0 0.0 327.0 4.0 0.0 0.0 88.629337 0.000000 11.233253 0.137410 0.000000 0.000000
20 Physical sciences nec 4.0 0.0 0.0 4.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
21 Psychology 5690.0 3001.0 13.0 2656.0 0.0 20.0 0.0 52.741652 0.228471 46.678383 0.000000 0.351494 0.000000
22 Social sciences 13350.0 2454.0 359.0 10062.0 319.0 113.0 43.0 18.382022 2.689139 75.370787 2.389513 0.846442 0.322097
23 Anthropology 792.0 270.0 0.0 465.0 0.0 14.0 43.0 34.090909 0.000000 58.712121 0.000000 1.767677 5.429293
24 Economics 3707.0 0.0 0.0 3707.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
25 Political science and government 898.0 1.0 0.0 883.0 0.0 14.0 0.0 0.111359 0.000000 98.329621 0.000000 1.559020 0.000000
26 Sociology, demography, and population studies 2908.0 84.0 183.0 2641.0 0.0 0.0 0.0 2.888583 6.292985 90.818432 0.000000 0.000000 0.000000
27 Social sciences nec 5045.0 2099.0 176.0 2366.0 319.0 85.0 0.0 41.605550 3.488603 46.897919 6.323092 1.684836 0.000000
28 Sciences nec 264.0 0.0 0.0 264.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
29 Engineering 51029.0 32570.0 3216.0 12977.0 1162.0 670.0 434.0 63.826452 6.302299 25.430637 2.277137 1.312979 0.850497
30 Aerospace, aeronautical, and astronautical eng... 4665.0 3291.0 50.0 1324.0 0.0 0.0 0.0 70.546624 1.071811 28.381565 0.000000 0.000000 0.000000
31 Bioengineering and biomedical engineering 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
32 Chemical engineering 6647.0 3946.0 20.0 2164.0 325.0 192.0 0.0 59.365127 0.300888 32.556040 4.889424 2.888521 0.000000
33 Civil engineering 15198.0 8770.0 2872.0 3068.0 96.0 119.0 273.0 57.704961 18.897223 20.186867 0.631662 0.782998 1.796289
34 Electrical, electronic, and communications eng... 10299.0 7380.0 109.0 1985.0 391.0 346.0 88.0 71.657442 1.058355 19.273716 3.796485 3.359549 0.854452
35 Industrial and manufacturing engineering 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
36 Mechanical engineering 7056.0 4641.0 144.0 2096.0 157.0 0.0 18.0 65.773810 2.040816 29.705215 2.225057 0.000000 0.255102
37 Metallurgical and materials engineering 6473.0 3896.0 21.0 2308.0 193.0 0.0 55.0 60.188475 0.324425 35.655801 2.981616 0.000000 0.849683
38 Engineering nec 691.0 646.0 0.0 32.0 0.0 13.0 0.0 93.487699 0.000000 4.630970 0.000000 1.881331 0.000000
39 Non-S&E 41017.0 9417.0 1135.0 29623.0 241.0 513.0 88.0 22.958773 2.767145 72.221274 0.587561 1.250701 0.214545
40 Business management and business administration 11919.0 1114.0 717.0 9954.0 123.0 11.0 0.0 9.346422 6.015605 83.513718 1.031966 0.092290 0.000000
41 Communication and communications technologies 2675.0 47.0 0.0 2612.0 0.0 14.0 2.0 1.757009 0.000000 97.644860 0.000000 0.523364 0.074766
42 Education 9731.0 4520.0 379.0 4688.0 2.0 142.0 0.0 46.449491 3.894769 48.175933 0.020553 1.459254 0.000000
43 Humanities 4813.0 18.0 0.0 4685.0 0.0 51.0 59.0 0.373987 0.000000 97.340536 0.000000 1.059630 1.225847
44 Law 2076.0 0.0 0.0 1876.0 0.0 200.0 0.0 0.000000 0.000000 90.366089 0.000000 9.633911 0.000000
45 Social work 4557.0 3565.0 20.0 914.0 0.0 45.0 13.0 78.231293 0.438885 20.057055 0.000000 0.987492 0.285275
46 Visual and performing arts 2803.0 0.0 17.0 2762.0 0.0 10.0 14.0 0.000000 0.606493 98.537281 0.000000 0.356761 0.499465
47 Non-S&E nec 2443.0 153.0 2.0 2132.0 116.0 40.0 0.0 6.262792 0.081867 87.269750 4.748260 1.637331 0.000000
In [32]:
df.sort_values(key_inst + " Percent", ascending=False)
Out[32]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Field All R&D expenditures Federal government State and local government Institution funds Business Nonprofit organizations All other sources Federal government Percent State and local government Percent Institution funds Percent Business Percent Nonprofit organizations Percent All other sources Percent
24 Economics 3707.0 0.0 0.0 3707.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
20 Physical sciences nec 4.0 0.0 0.0 4.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
28 Sciences nec 264.0 0.0 0.0 264.0 0.0 0.0 0.0 0.000000 0.000000 100.000000 0.000000 0.000000 0.000000
46 Visual and performing arts 2803.0 0.0 17.0 2762.0 0.0 10.0 14.0 0.000000 0.606493 98.537281 0.000000 0.356761 0.499465
25 Political science and government 898.0 1.0 0.0 883.0 0.0 14.0 0.0 0.111359 0.000000 98.329621 0.000000 1.559020 0.000000
41 Communication and communications technologies 2675.0 47.0 0.0 2612.0 0.0 14.0 2.0 1.757009 0.000000 97.644860 0.000000 0.523364 0.074766
43 Humanities 4813.0 18.0 0.0 4685.0 0.0 51.0 59.0 0.373987 0.000000 97.340536 0.000000 1.059630 1.225847
26 Sociology, demography, and population studies 2908.0 84.0 183.0 2641.0 0.0 0.0 0.0 2.888583 6.292985 90.818432 0.000000 0.000000 0.000000
44 Law 2076.0 0.0 0.0 1876.0 0.0 200.0 0.0 0.000000 0.000000 90.366089 0.000000 9.633911 0.000000
47 Non-S&E nec 2443.0 153.0 2.0 2132.0 116.0 40.0 0.0 6.262792 0.081867 87.269750 4.748260 1.637331 0.000000
40 Business management and business administration 11919.0 1114.0 717.0 9954.0 123.0 11.0 0.0 9.346422 6.015605 83.513718 1.031966 0.092290 0.000000
22 Social sciences 13350.0 2454.0 359.0 10062.0 319.0 113.0 43.0 18.382022 2.689139 75.370787 2.389513 0.846442 0.322097
39 Non-S&E 41017.0 9417.0 1135.0 29623.0 241.0 513.0 88.0 22.958773 2.767145 72.221274 0.587561 1.250701 0.214545
23 Anthropology 792.0 270.0 0.0 465.0 0.0 14.0 43.0 34.090909 0.000000 58.712121 0.000000 1.767677 5.429293
10 Biological and biomedical sciences 10055.0 4436.0 0.0 5540.0 1.0 62.0 16.0 44.117355 0.000000 55.096967 0.009945 0.616609 0.159125
6 Ocean sciences and marine sciences 2574.0 2.0 1306.0 1266.0 0.0 0.0 0.0 0.077700 50.738151 49.184149 0.000000 0.000000 0.000000
14 Mathematics and statistics 2918.0 453.0 1019.0 1418.0 0.0 28.0 0.0 15.524332 34.921179 48.594928 0.000000 0.959561 0.000000
42 Education 9731.0 4520.0 379.0 4688.0 2.0 142.0 0.0 46.449491 3.894769 48.175933 0.020553 1.459254 0.000000
8 Life sciences 23853.0 11858.0 0.0 11487.0 141.0 292.0 75.0 49.712824 0.000000 48.157464 0.591121 1.224165 0.314426
27 Social sciences nec 5045.0 2099.0 176.0 2366.0 319.0 85.0 0.0 41.605550 3.488603 46.897919 6.323092 1.684836 0.000000
21 Psychology 5690.0 3001.0 13.0 2656.0 0.0 20.0 0.0 52.741652 0.228471 46.678383 0.000000 0.351494 0.000000
11 Health sciences 13736.0 7366.0 0.0 5942.0 140.0 230.0 58.0 53.625510 0.000000 43.258591 1.019220 1.674432 0.422248
0 All R&D fields 184880.0 86244.0 16660.0 77554.0 1973.0 1753.0 696.0 46.648637 9.011251 41.948291 1.067179 0.948183 0.376460
1 Science 92834.0 44257.0 12309.0 34954.0 570.0 570.0 174.0 47.673266 13.259151 37.652153 0.613999 0.613999 0.187431
9 Agricultural sciences 11.0 7.0 0.0 4.0 0.0 0.0 0.0 63.636364 0.000000 36.363636 0.000000 0.000000 0.000000
37 Metallurgical and materials engineering 6473.0 3896.0 21.0 2308.0 193.0 0.0 55.0 60.188475 0.324425 35.655801 2.981616 0.000000 0.849683
17 Chemistry 7938.0 5109.0 0.0 2817.0 0.0 12.0 0.0 64.361300 0.000000 35.487528 0.000000 0.151172 0.000000
32 Chemical engineering 6647.0 3946.0 20.0 2164.0 325.0 192.0 0.0 59.365127 0.300888 32.556040 4.889424 2.888521 0.000000
36 Mechanical engineering 7056.0 4641.0 144.0 2096.0 157.0 0.0 18.0 65.773810 2.040816 29.705215 2.225057 0.000000 0.255102
30 Aerospace, aeronautical, and astronautical eng... 4665.0 3291.0 50.0 1324.0 0.0 0.0 0.0 70.546624 1.071811 28.381565 0.000000 0.000000 0.000000
15 Physical sciences 11124.0 7960.0 0.0 3148.0 4.0 12.0 0.0 71.556994 0.000000 28.299173 0.035958 0.107875 0.000000
29 Engineering 51029.0 32570.0 3216.0 12977.0 1162.0 670.0 434.0 63.826452 6.302299 25.430637 2.277137 1.312979 0.850497
33 Civil engineering 15198.0 8770.0 2872.0 3068.0 96.0 119.0 273.0 57.704961 18.897223 20.186867 0.631662 0.782998 1.796289
45 Social work 4557.0 3565.0 20.0 914.0 0.0 45.0 13.0 78.231293 0.438885 20.057055 0.000000 0.987492 0.285275
34 Electrical, electronic, and communications eng... 10299.0 7380.0 109.0 1985.0 391.0 346.0 88.0 71.657442 1.058355 19.273716 3.796485 3.359549 0.854452
2 Computer and information sciences 18813.0 5584.0 9591.0 3595.0 0.0 3.0 40.0 29.681603 50.980705 19.109127 0.000000 0.015946 0.212619
3 Geosciences, atmospheric sciences, and ocean s... 16822.0 12947.0 1327.0 2324.0 106.0 102.0 16.0 76.964689 7.888479 13.815242 0.630127 0.606349 0.095114
19 Physics 2911.0 2580.0 0.0 327.0 4.0 0.0 0.0 88.629337 0.000000 11.233253 0.137410 0.000000 0.000000
5 Geological and earth sciences 14248.0 12945.0 21.0 1058.0 106.0 102.0 16.0 90.854857 0.147389 7.425604 0.743964 0.715890 0.112296
38 Engineering nec 691.0 646.0 0.0 32.0 0.0 13.0 0.0 93.487699 0.000000 4.630970 0.000000 1.881331 0.000000
13 Life sciences nec 51.0 49.0 0.0 1.0 0.0 0.0 1.0 96.078431 0.000000 1.960784 0.000000 0.000000 1.960784
16 Astronomy and astrophysics 271.0 271.0 0.0 0.0 0.0 0.0 0.0 100.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 Atmospheric science and meteorology 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
7 Geosciences, atmospheric sciences, and ocean s... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
12 Natural resources and conservation 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
18 Materials science 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
31 Bioengineering and biomedical engineering 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
35 Industrial and manufacturing engineering 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
In [41]:
data_m = data[mask]
for key in data_m.academics.program_percentage.fields:
    mean = ak.mean(data_m.academics.program_percentage[key], axis=0)
    if ak.mean(mean) < 0.03:
        continue
    plt.plot(mean * 100, label=key)
plt.xlabel("Year")
plt.ylabel("Percent Enrollment")
plt.legend()
Out[41]:
<matplotlib.legend.Legend at 0x7d90e330f4a0>
No description has been provided for this image
In [42]:
avg_students = ak.mean(data.student.enrollment.undergrad_12_month, axis=1)
plt.hist(avg_students, bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Average)")
plt.yscale("log")
No description has been provided for this image
In [43]:
plt.hist(data.student.enrollment.undergrad_12_month[:, -1], bins=np.arange(0, 100_000, 1_000))
plt.xlabel("Students Enrolled")
plt.ylabel("Univeristy Count")
plt.title("Enrollemnt Distribution (Last Year)")
plt.yscale("log")
No description has been provided for this image
In [44]:
plt.hist(data.admissions.admission_rate.overall[:, -1])
Out[44]:
(array([ 35.,  48.,  40.,  75., 118., 158., 227., 334., 389., 522.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)
No description has been provided for this image
In [ ]:
data.school.fields