Example script and output before putting it into a Quarto report.
import pandas as pd
from plotnine import ggplot, aes, geom_point
from palmerpenguins import load_penguins
import seaborn as sns

# Load the Palmer Penguins dataset and check its dimensions (rows, columns).
penguins = load_penguins()
penguins.shape
(344, 8)
# drop rows with missing values (the original row index labels are kept)
penguins = penguins.dropna()
penguins.shape
(333, 8)
# EDA to find variables that separate Adelie from the other species.
# Gentoo is easier to separate.
# Observations from the pairwise scatter plots:
#   - bill_depth_mm and body_mass_g are good separators
#   - bill_depth_mm and flipper_length_mm are good separators
#   - body_mass_g and flipper_length_mm are NOT good separators
sns.pairplot(penguins, hue="species")
# fit logistic regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf

# recode variable: 1 when the species is Adelie, 0 otherwise
penguins['is_adelie'] = (penguins['species'] == 'Adelie').astype(int)

# logistic regression of the Adelie indicator on three body measurements
model = smf.logit(
    "is_adelie ~ bill_depth_mm + body_mass_g + flipper_length_mm",
    data=penguins,
).fit()
print(model.summary())
# predicted probabilities for first few rows
# (model.predict() with no arguments returns fitted values for the data
# the model was fit on, i.e. the estimated probability of is_adelie == 1)
penguins["predicted"] = model.predict()
penguins[["species", "predicted"]].head()
  species  predicted
0  Adelie   0.964279
1  Adelie   0.858842
2  Adelie   0.482890
4  Adelie   0.722669
5  Adelie   0.898866
# Store the fitted probability that each penguin is an Adelie
# (the model's response variable is is_adelie).
penguins['pred_prob'] = model.predict()

# The wildcard import supplies geom_boxplot, labs and theme_minimal, which
# the earlier explicit plotnine import (ggplot, aes, geom_point) does not.
from plotnine import *

# Box plot of the fitted Adelie probability, grouped by actual species.
# The title names Adelie because that is the class the model predicts.
(
    ggplot(penguins, aes(x='species', y='pred_prob'))
    + geom_boxplot(fill='#56B4E9')
    + labs(
        title="Predicted Probability of Adelie by Species",
        y="Predicted Probability",
        x="Species",
    )
    + theme_minimal()
)