Custom Features and Transformed Target Variables¶

This notebook demonstrates how to customize AGNBoost to your data needs through:

Custom feature creation.
Applying transformations to your target variables.

Let's start by importing the necessary libraries and loading our data.

In [2]:

Copied!





# Set agnboost folder as root
import os

# Navigate to the repository root (parent directory of notebooks/)
os.chdir('..')

# Verify we're in the right place
print(f"Current directory: {os.getcwd()}")
print(f"Contents: {os.listdir('.')}")

# Import necessary libraries
import numpy as np
import pandas as pd
from agnboost import dataset, model
#from sklearn.metrics import mean_squared_error

# Set random seed for reproducibility
np.random.seed(123)

print("AGNBoost Basic Usage Tutorial")
print("=" * 40)
# Set agnboost folder as root
import os

# Navigate to the repository root (parent directory of notebooks/)
os.chdir('..')

# Verify we're in the right place
print(f"Current directory: {os.getcwd()}")
print(f"Contents: {os.listdir('.')}")

# Import necessary libraries
import numpy as np
import pandas as pd
from agnboost import dataset, model
#from sklearn.metrics import mean_squared_error

# Set random seed for reproducibility
np.random.seed(123)

print("AGNBoost Basic Usage Tutorial")
print("=" * 40)

Current directory: /home/kurt/Documents/agnboost
Contents: ['pyproject.toml', 'README.md', 'models', 'notebooks', 'figures', '.gitignore', '.env', 'mkdocs.yml', '.github', 'data', 'models_all', 'docs', 'LICENSE', 'tests', '.git', 'agnboost']

2025-05-31 20:31:50.184 | INFO     | agnboost.config:<module>:11 - PROJ_ROOT path is: /home/kurt/Documents/agnboost

AGNBoost Basic Usage Tutorial
========================================

Loading the Data¶

We'll use the Catalog class to load our astronomical dataset. The models-block-0.fits file contains photometric measurements and AGN fraction labels for our analysis. We will load it and print out the data summary so we can easily see all the columns in the data.

In [3]:

Copied!

# Load the astronomical data using the Catalog class
catalog = dataset.Catalog(path="data/cigale_mock_small.csv",summarize = True)
# Load the astronomical data using the Catalog class
catalog = dataset.Catalog(path="data/cigale_mock_small.csv",summarize = True)

Current working directory: /home/kurt/Documents/agnboost
Looking for bands file at: /home/kurt/Documents/agnboost/agnboost/allowed_bands.json
[INFO] Loaded bands file metadata: This file contains the allowed photometric bands for JWST
[INFO] Loaded 11 allowed bands from agnboost/allowed_bands.json
[INFO] Attempting to load file with delimiter: ','
[INFO] Successfully loaded data with 1000 rows.
[INFO] Found 11 valid band columns:
[INFO]   - jwst.nircam.F115W (F115W): 1.154 μm
[INFO]   - jwst.nircam.F150W (F150W): 1.501 μm
[INFO]   - jwst.nircam.F200W (F200W): 1.988 μm
[INFO]   - jwst.nircam.F277W (F277W): 2.776 μm
[INFO]   - jwst.nircam.F356W (F356W): 3.565 μm
[INFO]   - jwst.nircam.F410M (F410M): 4.083 μm
[INFO]   - jwst.nircam.F444W (F444W): 4.402 μm
[INFO]   - jwst.miri.F770W (F770W): 7.7 μm
[INFO]   - jwst.miri.F1000W (F1000W): 10.0 μm
[INFO]   - jwst.miri.F1500W (F1500W): 15.0 μm
[INFO]   - jwst.miri.F2100W (F2100W): 21.0 μm

================================================================================
DATA SUMMARY: cigale_mock_small.csv
================================================================================
Dimensions: 1000 rows × 26 columns
Memory usage: 0.20 MB
--------------------------------------------------------------------------------
Valid Band Columns:
--------------------------------------------------------------------------------
Column Name                    Shorthand       Wavelength (μm)
--------------------------------------------------------------------------------
jwst.nircam.F115W              F115W           1.154          
jwst.nircam.F150W              F150W           1.501          
jwst.nircam.F200W              F200W           1.988          
jwst.nircam.F277W              F277W           2.776          
jwst.nircam.F356W              F356W           3.565          
jwst.nircam.F410M              F410M           4.083          
jwst.nircam.F444W              F444W           4.402          
jwst.miri.F770W                F770W           7.700          
jwst.miri.F1000W               F1000W          10.000         
jwst.miri.F1500W               F1500W          15.000         
jwst.miri.F2100W               F2100W          21.000         
--------------------------------------------------------------------------------
Column Information:
--------------------------------------------------------------------------------
Column Name                    Type            Non-Null        Null %    
--------------------------------------------------------------------------------
IRAC1                          float64         1000/1000            0.00%     
IRAC2                          float64         1000/1000            0.00%     
IRAC3                          float64         1000/1000            0.00%     
IRAC4                          float64         1000/1000            0.00%     
hst.acs.wfc.F606W              float64         1000/1000            0.00%     
hst.acs.wfc.F814W              float64         1000/1000            0.00%     
hst.wfc3.ir.F125W              float64         1000/1000            0.00%     
hst.wfc3.ir.F140W              float64         1000/1000            0.00%     
hst.wfc3.ir.F160W              float64         1000/1000            0.00%     
jwst.miri.F1000W               float64         1000/1000            0.00%     
jwst.miri.F1280W               float64         1000/1000            0.00%     
jwst.miri.F1500W               float64         1000/1000            0.00%     
jwst.miri.F1800W               float64         1000/1000            0.00%     
jwst.miri.F2100W               float64         1000/1000            0.00%     
jwst.miri.F770W                float64         1000/1000            0.00%     
jwst.nircam.F115W              float64         1000/1000            0.00%     
jwst.nircam.F150W              float64         1000/1000            0.00%     
jwst.nircam.F200W              float64         1000/1000            0.00%     
jwst.nircam.F277W              float64         1000/1000            0.00%     
jwst.nircam.F356W              float64         1000/1000            0.00%     
jwst.nircam.F410M              float64         1000/1000            0.00%     
jwst.nircam.F444W              float64         1000/1000            0.00%     
sfh.sfr100Myrs                 float64         1000/1000            0.00%     
stellar.m_star                 float64         1000/1000            0.00%     
agn.fracAGN                    float64         1000/1000            0.00%     
universe.redshift              float64         1000/1000            0.00%     
--------------------------------------------------------------------------------

Numeric Column Statistics:
--------------------------------------------------------------------------------
Column               Mean         Std          Min          Max         
--------------------------------------------------------------------------------
IRAC1                57.9         1308         2.413e-06    4.098e+04   
IRAC2                22.97        509.9        8.821e-07    1.596e+04   
IRAC3                39.96        918          1.646e-06    2.879e+04   
IRAC4                57.92        1309         2.413e-06    4.099e+04   
hst.acs.wfc.F606W    0.311        5.52         0            169         
hst.acs.wfc.F814W    0.3093       5.099        5.455e-13    155.8       
hst.wfc3.ir.F125W    0.5148       6.576        1.614e-09    192.2       
hst.wfc3.ir.F140W    0.6132       7.125        2.611e-09    196.7       
hst.wfc3.ir.F160W    0.7412       7.991        4.119e-09    200.2       
jwst.miri.F1000W     57.54        1356         3.049e-06    4.257e+04   
jwst.miri.F1280W     71.4         1587         4.006e-06    4.97e+04    
jwst.miri.F1500W     74.16        1638         4.475e-06    5.129e+04   
jwst.miri.F1800W     82.2         1710         4.232e-06    5.339e+04   
jwst.miri.F2100W     87.79        1773         4.001e-06    5.527e+04   
jwst.miri.F770W      58.58        1315         2.288e-06    4.117e+04   
jwst.nircam.F115W    0.461        6.317        1.192e-09    188.6       
jwst.nircam.F150W    0.706        7.721        3.693e-09    198.7       
jwst.nircam.F200W    1.482        15.31        1.959e-08    280.6       
jwst.nircam.F277W    4.441        68.46        1.332e-07    2009        
jwst.nircam.F356W    11.62        225.3        4.48e-07     6973        
jwst.nircam.F410M    17.51        373          6.242e-07    1.164e+04   
jwst.nircam.F444W    21.65        477          8.29e-07     1.492e+04   
sfh.sfr100Myrs       4.765        4.403        4.765e-27    15.79       
stellar.m_star       3.51e+09     2.551e+09    3.367e+07    7.388e+09   
agn.fracAGN          0.4993       0.3164       0            0.99        
universe.redshift    1.765        1.811        0.01         7.999       
================================================================================

Creating Custom Features¶

By default, AGNBoost will create a feature dataframe that includes all of the photometric bands matching valid bands (in the bands.json), all non-reciprocal colors derived from those, and the squares of those colors. However, it is also simple to create a feature dataframe to only include features you want, including features not included by default.

Let's say we want to create a feature dataframe that consists of:

All the valid photometric bands. We will take the log10 of the fluxes.
ONLY the F770W/F444W and F21000/F770W colors
the redshift

To do this, we must first create our custom function to create this dataframe from the data stored in our catalog object. Note that this function is expected to have only one input (the data), and needs to return a pandas Dataframe. We will create this function and then test it to ensure that it is working.

In [4]:

Copied!





# Get the list of the valid photometric bands
VALID_BANDS = catalog.get_valid_bands_list()
print(f"Valid bands: {VALID_BANDS# Navigate to the repository root (parent directory of notebooks/)
os.chdir('..')

# Verify we're in the right place
print(f"Current directory: {os.getcwd()}")
print(f"Contents: {os.listdir('.')}")}")

def custom_feature_func(data: pd.DataFrame) -> pd.DataFrame:
    # Create all the features we want
    log_phot_df = data[VALID_BANDS].apply( np.log10 )

    f770w_f444w_color = np.log10( data['jwst.miri.F770W']/data['jwst.nircam.F444W'] ).rename("F770W/F444W")
    f2100W_f770W_color = np.log10( data['jwst.miri.F2100W']/data['jwst.miri.F770W'] ).rename("F2100W/F770W")

    z_df = data['universe.redshift']

    # Combine them together so that they have shape (N_data, N_features)    
    feature_df = pd.concat( [log_phot_df, f770w_f444w_color, f2100W_f770W_color, z_df], axis=1, join = 'outer')
    return feature_df

# Now, let's test this.
test_feature_df = custom_feature_func(data = catalog.get_data() )
print(f"\ntest_feature_df has shape {test_feature_df.shape} and original data has {len( catalog.get_data() )} rows.")
print(f"test_feature_df consists of {test_feature_df.shape[1]} features: {list(test_feature_df.columns)}\n")
# Get the list of the valid photometric bands
VALID_BANDS = catalog.get_valid_bands_list()
print(f"Valid bands: {VALID_BANDS# Navigate to the repository root (parent directory of notebooks/)
os.chdir('..')

# Verify we're in the right place
print(f"Current directory: {os.getcwd()}")
print(f"Contents: {os.listdir('.')}")}")

def custom_feature_func(data: pd.DataFrame) -> pd.DataFrame:
    # Create all the features we want
    log_phot_df = data[VALID_BANDS].apply( np.log10 )

    f770w_f444w_color = np.log10( data['jwst.miri.F770W']/data['jwst.nircam.F444W'] ).rename("F770W/F444W")
    f2100W_f770W_color = np.log10( data['jwst.miri.F2100W']/data['jwst.miri.F770W'] ).rename("F2100W/F770W")

    z_df = data['universe.redshift']

    # Combine them together so that they have shape (N_data, N_features)    
    feature_df = pd.concat( [log_phot_df, f770w_f444w_color, f2100W_f770W_color, z_df], axis=1, join = 'outer')
    return feature_df

# Now, let's test this.
test_feature_df = custom_feature_func(data = catalog.get_data() )
print(f"\ntest_feature_df has shape {test_feature_df.shape} and original data has {len( catalog.get_data() )} rows.")
print(f"test_feature_df consists of {test_feature_df.shape[1]} features: {list(test_feature_df.columns)}\n")

  Cell In[4], line 3
    print(f"Valid bands: {VALID_BANDS# Navigate to the repository root (parent directory of notebooks/)
          ^
SyntaxError: unterminated string literal (detected at line 3)

This custom feature dataframe is saved into our catalog instance, so we are good to continue from here as usual (i.e., following the same method in the basic-usage.ipynb example).

Applying transformations to the target variable (e.g. redshift)¶

You may wish to apply some form of transformation to your target variable (i.e., that which you are performing regression to predict). For example, in the AGNBoost paper (Hamblin+2025), we applied a modified sigmoid transformation to redshift, in order to transform redshift from the (theoretical) [0,inf) range to (0,1). This allowed us to predict a beta distribution with AGNBoost.

This transformation functionality is built-in to AGNBoost. All we need to do is define the function for the transformation. Let's create the python function for the modified sigmoid transformation above. Note that this modified sigmoid transforamtion has an optional parameter a which is used to tune the effects of the transformation.

We also need to define the inverse of the transformation, in order to transform the predictions from AGNBoost back to the untransformed space of interest (i.e., transformed-redshift space -> redhisft space)

In [ ]:

Copied!





def mod_sigmoid_trans(z, a = 0.4):
    trans_z =  2/ (1 + np.exp(-a*z)) - 1
    return trans_z
    
# The 1e-8 is included in the log to avoid issues of log(0)
def inverse_mod_sigmoid_trans(trans_z, a = 0.4):
    z = -(1/a)*np.log( 2/(1+trans_z) -1 + 1e-8)
    return z
def mod_sigmoid_trans(z, a = 0.4):
    trans_z =  2/ (1 + np.exp(-a*z)) - 1
    return trans_z
    
# The 1e-8 is included in the log to avoid issues of log(0)
def inverse_mod_sigmoid_trans(trans_z, a = 0.4):
    z = -(1/a)*np.log( 2/(1+trans_z) -1 + 1e-8)
    return z

We can now use this transformation to add a transformed redshift column to the saved data in our catalog instance. We will perform a sanity check to ensure that applying the inverse of our transformation to the transformed data matches the original data.

In [ ]:

Copied!





# Let's create a name for the new column. 
# Note that this will also become the name of the model used to predict this transformed variable 
#     (and consequently the name of the directory the models will be saved in)
transformed_col_name = "mod_sigmoid_redshift"

transformed_redshift = catalog.transform( column_name = 'universe.redshift', 
                                         transform_func = mod_sigmoid_trans, 
                                         new_column_name = transformed_col_name
                                        )

print(f"Created transformed redshift column {transformed_redshift.name}")
print(f"Stored data now has columns: {catalog.get_data().columns}")


#--------------------

# Let's ensure that taking the inverse transformation of this returns the redshifts to their original state
original_z = catalog.get_data()['universe.redshift']

# Apply the inverse transformation to our transformed redshift
inverse_trans_z = inverse_mod_sigmoid_trans( transformed_redshift )
mismatch_z = 0

# Iterate through the arrays
for i in range(len(original_z)):  
    if np.around(original_z.iloc[i], decimals = 3) != np.around(inverse_trans_z.iloc[i], decimals = 3):
        mismatch_z += 1
        print(f"redshift mismatch. orig z: {original_z.iloc[i]:.3f}, recovered transformed z: {inverse_trans_z.iloc[i]:.3f}")

if mismatch_z == 0:
    print(f"\nNo redshift mismatches after transorming and transforming back!\n")
# Let's create a name for the new column. 
# Note that this will also become the name of the model used to predict this transformed variable 
#     (and consequently the name of the directory the models will be saved in)
transformed_col_name = "mod_sigmoid_redshift"

transformed_redshift = catalog.transform( column_name = 'universe.redshift', 
                                         transform_func = mod_sigmoid_trans, 
                                         new_column_name = transformed_col_name
                                        )

print(f"Created transformed redshift column {transformed_redshift.name}")
print(f"Stored data now has columns: {catalog.get_data().columns}")


#--------------------

# Let's ensure that taking the inverse transformation of this returns the redshifts to their original state
original_z = catalog.get_data()['universe.redshift']

# Apply the inverse transformation to our transformed redshift
inverse_trans_z = inverse_mod_sigmoid_trans( transformed_redshift )
mismatch_z = 0

# Iterate through the arrays
for i in range(len(original_z)):  
    if np.around(original_z.iloc[i], decimals = 3) != np.around(inverse_trans_z.iloc[i], decimals = 3):
        mismatch_z += 1
        print(f"redshift mismatch. orig z: {original_z.iloc[i]:.3f}, recovered transformed z: {inverse_trans_z.iloc[i]:.3f}")

if mismatch_z == 0:
    print(f"\nNo redshift mismatches after transorming and transforming back!\n")

We could then create an AGNBoost model to perform regression on the transformed redshift:

In [ ]:

Copied!

# First, we will create the feature dataframe
catalog.create_feature_dataframe(silent = True)

agnboost_m = model.AGNBoost( feature_names = catalog.get_feature_names(),
                              target_variables = {'mod_sigmoid_redshift' : 'Beta'}
                           )
print(f"AGNBoost object made with target varible name {list(agnboost_m.get_models().keys())[0]}. A {agnboost_m.get_models()['mod_sigmoid_redshift']} distribution is predicted for this target variable.\n")

# First, we will create the feature dataframe
catalog.create_feature_dataframe(silent = True)

agnboost_m = model.AGNBoost( feature_names = catalog.get_feature_names(),
                              target_variables = {'mod_sigmoid_redshift' : 'Beta'}
                           )
print(f"AGNBoost object made with target varible name {list(agnboost_m.get_models().keys())[0]}. A {agnboost_m.get_models()['mod_sigmoid_redshift']} distribution is predicted for this target variable.\n")