fast_seqfunc.py
1 # /// script 2 # requires-python = "<=3.11" 3 # dependencies = [ 4 # "anthropic==0.49.0", 5 # "marimo", 6 # "numpy==1.26.4", 7 # "pandas==2.1.4", 8 # "pycaret[full]==3.3.2", 9 # "scikit-learn==1.4.2", 10 # ] 11 # /// 12 13 import marimo 14 15 __generated_with = "0.11.26" 16 app = marimo.App(width="medium") 17 18 19 @app.cell 20 def _(): 21 from itertools import product 22 23 import numpy as np 24 25 # Protein sequence data 26 amino_acids = "ACDEFGHIKLMNPQRSTVWY" 27 protein_length = 10 28 n_protein_samples = 1000 29 30 # Generate random protein sequences 31 protein_sequences = [ 32 "".join(np.random.choice(list(amino_acids), protein_length)) 33 for _ in range(n_protein_samples) 34 ] 35 36 # Complex function for proteins based on: 37 # - hydrophobicity patterns 38 # - charge distribution 39 # - sequence motif presence 40 hydrophobic = "AILMFWV" 41 charged = "DEKR" 42 motif = "KR" 43 44 def protein_function(seq): 45 # Hydrophobicity score 46 hydro_score = sum( 47 1 for i, aa in enumerate(seq) if aa in hydrophobic and i > len(seq) / 2 48 ) 49 50 # Charge distribution 51 charge_pairs = sum( 52 1 53 for i in range(len(seq) - 1) 54 if seq[i] in charged and seq[i + 1] in charged 55 ) 56 57 # Motif presence with position weight 58 motif_score = sum(i / len(seq) for i, aa in enumerate(seq) if aa in motif) 59 60 # Combine non-linearly 61 return ( 62 np.exp(hydro_score * 0.5) 63 + (charge_pairs**2) 64 + (motif_score * 3) 65 + np.sin(hydro_score * charge_pairs * 0.3) 66 ) 67 68 protein_values = np.array([protein_function(seq) for seq in protein_sequences]) 69 70 # DNA sequence data 71 nucleotides = "ACGTU-" 72 dna_length = 20 73 n_dna_samples = 1000 74 75 # Generate random DNA sequences 76 dna_sequences = [ 77 "".join(np.random.choice(list(nucleotides), dna_length)) 78 for _ in range(n_dna_samples) 79 ] 80 81 # Complex function for DNA based on: 82 # - GC content variation 83 # - palindrome presence 84 # - specific motif positioning 85 def dna_function(seq): 86 # GC content with position weights 87 gc_score = sum(2 / (i + 1) for i, nt in enumerate(seq) if nt in "GC") 88 89 # Palindrome contribution 90 palindrome_score = sum( 91 1 for i in range(len(seq) - 3) if seq[i : i + 4] == seq[i : i + 4][::-1] 92 ) 93 94 # TATA-like motif presence with spacing effects 95 tata_score = 0 96 for i in range(len(seq) - 3): 97 if seq[i : i + 2] == "TA" and seq[i + 2 : i + 4] == "TA": 98 tata_score += np.log(i + 1) 99 100 # Combine non-linearly 101 return ( 102 np.exp(gc_score * 0.3) 103 + (palindrome_score**1.5) 104 + np.cos(tata_score) * np.sqrt(gc_score + palindrome_score + 1) 105 ) 106 107 dna_values = np.array([dna_function(seq) for seq in dna_sequences]) 108 109 # Normalize both value sets to similar ranges 110 protein_values = (protein_values - protein_values.mean()) / protein_values.std() 111 dna_values = (dna_values - dna_values.mean()) / dna_values.std() 112 return ( 113 amino_acids, 114 charged, 115 dna_function, 116 dna_length, 117 dna_sequences, 118 dna_values, 119 hydrophobic, 120 motif, 121 n_dna_samples, 122 n_protein_samples, 123 np, 124 nucleotides, 125 product, 126 protein_function, 127 protein_length, 128 protein_sequences, 129 protein_values, 130 ) 131 132 133 @app.cell 134 def _(dna_sequences, dna_values, protein_sequences, protein_values): 135 import pandas as pd 136 137 protein_df = pd.DataFrame( 138 {"sequence": protein_sequences, "function": protein_values} 139 ) 140 141 dna_df = pd.DataFrame({"sequence": dna_sequences, "function": dna_values}) 142 return dna_df, pd, protein_df 143 144 145 @app.cell 146 def _(protein_df): 147 protein_df 148 return 149 150 151 @app.cell 152 def _(dna_df): 153 dna_df 154 return 155 156 157 @app.cell 158 def _(np): 159 def one_hot_encode(sequence, alphabet, flatten=False): 160 seq_length = len(sequence) 161 alphabet_length = len(alphabet) 162 163 # Create mapping from characters to indices 164 char_to_idx = {char: idx for idx, char in enumerate(alphabet)} 165 166 # Initialize one-hot matrix 167 one_hot = np.zeros((alphabet_length, seq_length)) 168 169 # Fill the matrix 170 for pos, char in enumerate(sequence): 171 one_hot[char_to_idx[char], pos] = 1 172 173 if flatten: 174 return one_hot.flatten() 175 return one_hot 176 177 return (one_hot_encode,) 178 179 180 @app.cell 181 def _( 182 amino_acids, 183 dna_sequences, 184 dna_values, 185 np, 186 nucleotides, 187 one_hot_encode, 188 pd, 189 protein_sequences, 190 protein_values, 191 ): 192 from sklearn.model_selection import train_test_split 193 194 # One-hot encode sequences 195 protein_encoded = np.array( 196 [one_hot_encode(seq, amino_acids, flatten=True) for seq in protein_sequences] 197 ) 198 dna_encoded = np.array( 199 [one_hot_encode(seq, nucleotides, flatten=True) for seq in dna_sequences] 200 ) 201 202 # Create new dataframes with encoded sequences 203 protein_encoded_df = pd.DataFrame(protein_encoded, index=protein_sequences) 204 protein_encoded_df["function"] = protein_values 205 206 dna_encoded_df = pd.DataFrame(dna_encoded, index=dna_sequences) 207 dna_encoded_df["function"] = dna_values 208 209 # Split data into train (60%), test (20%), and heldout (20%) sets 210 train_size = 0.6 211 test_size = 0.2 212 random_state = 42 213 214 # Protein data splits 215 protein_train, protein_temp = train_test_split( 216 protein_encoded_df, train_size=train_size, random_state=random_state 217 ) 218 protein_test, protein_heldout = train_test_split( 219 protein_temp, test_size=0.5, random_state=random_state 220 ) 221 222 # DNA data splits 223 dna_train, dna_temp = train_test_split( 224 dna_encoded_df, train_size=train_size, random_state=random_state 225 ) 226 dna_test, dna_heldout = train_test_split( 227 dna_temp, test_size=0.5, random_state=random_state 228 ) 229 return ( 230 dna_encoded, 231 dna_encoded_df, 232 dna_heldout, 233 dna_temp, 234 dna_test, 235 dna_train, 236 protein_encoded, 237 protein_encoded_df, 238 protein_heldout, 239 protein_temp, 240 protein_test, 241 protein_train, 242 random_state, 243 test_size, 244 train_size, 245 train_test_split, 246 ) 247 248 249 @app.cell 250 def _(protein_train): 251 from pycaret.regression import setup 252 253 s = setup(protein_train, target="function", session_id=123) 254 return s, setup 255 256 257 @app.cell 258 def _(): 259 from pycaret.regression import compare_models 260 261 best = compare_models() 262 return best, compare_models 263 264 265 @app.cell 266 def _(best): 267 from pycaret.regression import evaluate_model, plot_model 268 269 plot_model(best, plot="residuals") 270 return evaluate_model, plot_model 271 272 273 @app.cell 274 def _(best, plot_model): 275 plot_model(best, plot="error") 276 return 277 278 279 @app.cell 280 def _(best): 281 from pycaret.regression import predict_model 282 283 predict_model(best) 284 return (predict_model,) 285 286 287 @app.cell 288 def _(best, predict_model, protein_heldout): 289 predictions = predict_model(best, data=protein_heldout) 290 predictions["function"].sort_values(ascending=False) 291 292 return (predictions,) 293 294 295 if __name__ == "__main__": 296 app.run()