/ notebooks / fast_seqfunc.py
fast_seqfunc.py
  1  # /// script
  2  # requires-python = "<=3.11"
  3  # dependencies = [
  4  #     "anthropic==0.49.0",
  5  #     "marimo",
  6  #     "numpy==1.26.4",
  7  #     "pandas==2.1.4",
  8  #     "pycaret[full]==3.3.2",
  9  #     "scikit-learn==1.4.2",
 10  # ]
 11  # ///
 12  
 13  import marimo
 14  
 15  __generated_with = "0.11.26"
 16  app = marimo.App(width="medium")
 17  
 18  
 19  @app.cell
 20  def _():
 21      from itertools import product
 22  
 23      import numpy as np
 24  
 25      # Protein sequence data
 26      amino_acids = "ACDEFGHIKLMNPQRSTVWY"
 27      protein_length = 10
 28      n_protein_samples = 1000
 29  
 30      # Generate random protein sequences
 31      protein_sequences = [
 32          "".join(np.random.choice(list(amino_acids), protein_length))
 33          for _ in range(n_protein_samples)
 34      ]
 35  
 36      # Complex function for proteins based on:
 37      # - hydrophobicity patterns
 38      # - charge distribution
 39      # - sequence motif presence
 40      hydrophobic = "AILMFWV"
 41      charged = "DEKR"
 42      motif = "KR"
 43  
 44      def protein_function(seq):
 45          # Hydrophobicity score
 46          hydro_score = sum(
 47              1 for i, aa in enumerate(seq) if aa in hydrophobic and i > len(seq) / 2
 48          )
 49  
 50          # Charge distribution
 51          charge_pairs = sum(
 52              1
 53              for i in range(len(seq) - 1)
 54              if seq[i] in charged and seq[i + 1] in charged
 55          )
 56  
 57          # Motif presence with position weight
 58          motif_score = sum(i / len(seq) for i, aa in enumerate(seq) if aa in motif)
 59  
 60          # Combine non-linearly
 61          return (
 62              np.exp(hydro_score * 0.5)
 63              + (charge_pairs**2)
 64              + (motif_score * 3)
 65              + np.sin(hydro_score * charge_pairs * 0.3)
 66          )
 67  
 68      protein_values = np.array([protein_function(seq) for seq in protein_sequences])
 69  
 70      # DNA sequence data
 71      nucleotides = "ACGTU-"
 72      dna_length = 20
 73      n_dna_samples = 1000
 74  
 75      # Generate random DNA sequences
 76      dna_sequences = [
 77          "".join(np.random.choice(list(nucleotides), dna_length))
 78          for _ in range(n_dna_samples)
 79      ]
 80  
 81      # Complex function for DNA based on:
 82      # - GC content variation
 83      # - palindrome presence
 84      # - specific motif positioning
 85      def dna_function(seq):
 86          # GC content with position weights
 87          gc_score = sum(2 / (i + 1) for i, nt in enumerate(seq) if nt in "GC")
 88  
 89          # Palindrome contribution
 90          palindrome_score = sum(
 91              1 for i in range(len(seq) - 3) if seq[i : i + 4] == seq[i : i + 4][::-1]
 92          )
 93  
 94          # TATA-like motif presence with spacing effects
 95          tata_score = 0
 96          for i in range(len(seq) - 3):
 97              if seq[i : i + 2] == "TA" and seq[i + 2 : i + 4] == "TA":
 98                  tata_score += np.log(i + 1)
 99  
100          # Combine non-linearly
101          return (
102              np.exp(gc_score * 0.3)
103              + (palindrome_score**1.5)
104              + np.cos(tata_score) * np.sqrt(gc_score + palindrome_score + 1)
105          )
106  
107      dna_values = np.array([dna_function(seq) for seq in dna_sequences])
108  
109      # Normalize both value sets to similar ranges
110      protein_values = (protein_values - protein_values.mean()) / protein_values.std()
111      dna_values = (dna_values - dna_values.mean()) / dna_values.std()
112      return (
113          amino_acids,
114          charged,
115          dna_function,
116          dna_length,
117          dna_sequences,
118          dna_values,
119          hydrophobic,
120          motif,
121          n_dna_samples,
122          n_protein_samples,
123          np,
124          nucleotides,
125          product,
126          protein_function,
127          protein_length,
128          protein_sequences,
129          protein_values,
130      )
131  
132  
133  @app.cell
134  def _(dna_sequences, dna_values, protein_sequences, protein_values):
135      import pandas as pd
136  
137      protein_df = pd.DataFrame(
138          {"sequence": protein_sequences, "function": protein_values}
139      )
140  
141      dna_df = pd.DataFrame({"sequence": dna_sequences, "function": dna_values})
142      return dna_df, pd, protein_df
143  
144  
145  @app.cell
146  def _(protein_df):
147      protein_df
148      return
149  
150  
151  @app.cell
152  def _(dna_df):
153      dna_df
154      return
155  
156  
157  @app.cell
158  def _(np):
159      def one_hot_encode(sequence, alphabet, flatten=False):
160          seq_length = len(sequence)
161          alphabet_length = len(alphabet)
162  
163          # Create mapping from characters to indices
164          char_to_idx = {char: idx for idx, char in enumerate(alphabet)}
165  
166          # Initialize one-hot matrix
167          one_hot = np.zeros((alphabet_length, seq_length))
168  
169          # Fill the matrix
170          for pos, char in enumerate(sequence):
171              one_hot[char_to_idx[char], pos] = 1
172  
173          if flatten:
174              return one_hot.flatten()
175          return one_hot
176  
177      return (one_hot_encode,)
178  
179  
180  @app.cell
181  def _(
182      amino_acids,
183      dna_sequences,
184      dna_values,
185      np,
186      nucleotides,
187      one_hot_encode,
188      pd,
189      protein_sequences,
190      protein_values,
191  ):
192      from sklearn.model_selection import train_test_split
193  
194      # One-hot encode sequences
195      protein_encoded = np.array(
196          [one_hot_encode(seq, amino_acids, flatten=True) for seq in protein_sequences]
197      )
198      dna_encoded = np.array(
199          [one_hot_encode(seq, nucleotides, flatten=True) for seq in dna_sequences]
200      )
201  
202      # Create new dataframes with encoded sequences
203      protein_encoded_df = pd.DataFrame(protein_encoded, index=protein_sequences)
204      protein_encoded_df["function"] = protein_values
205  
206      dna_encoded_df = pd.DataFrame(dna_encoded, index=dna_sequences)
207      dna_encoded_df["function"] = dna_values
208  
209      # Split data into train (60%), test (20%), and heldout (20%) sets
210      train_size = 0.6
211      test_size = 0.2
212      random_state = 42
213  
214      # Protein data splits
215      protein_train, protein_temp = train_test_split(
216          protein_encoded_df, train_size=train_size, random_state=random_state
217      )
218      protein_test, protein_heldout = train_test_split(
219          protein_temp, test_size=0.5, random_state=random_state
220      )
221  
222      # DNA data splits
223      dna_train, dna_temp = train_test_split(
224          dna_encoded_df, train_size=train_size, random_state=random_state
225      )
226      dna_test, dna_heldout = train_test_split(
227          dna_temp, test_size=0.5, random_state=random_state
228      )
229      return (
230          dna_encoded,
231          dna_encoded_df,
232          dna_heldout,
233          dna_temp,
234          dna_test,
235          dna_train,
236          protein_encoded,
237          protein_encoded_df,
238          protein_heldout,
239          protein_temp,
240          protein_test,
241          protein_train,
242          random_state,
243          test_size,
244          train_size,
245          train_test_split,
246      )
247  
248  
249  @app.cell
250  def _(protein_train):
251      from pycaret.regression import setup
252  
253      s = setup(protein_train, target="function", session_id=123)
254      return s, setup
255  
256  
257  @app.cell
258  def _():
259      from pycaret.regression import compare_models
260  
261      best = compare_models()
262      return best, compare_models
263  
264  
265  @app.cell
266  def _(best):
267      from pycaret.regression import evaluate_model, plot_model
268  
269      plot_model(best, plot="residuals")
270      return evaluate_model, plot_model
271  
272  
273  @app.cell
274  def _(best, plot_model):
275      plot_model(best, plot="error")
276      return
277  
278  
279  @app.cell
280  def _(best):
281      from pycaret.regression import predict_model
282  
283      predict_model(best)
284      return (predict_model,)
285  
286  
287  @app.cell
288  def _(best, predict_model, protein_heldout):
289      predictions = predict_model(best, data=protein_heldout)
290      predictions["function"].sort_values(ascending=False)
291  
292      return (predictions,)
293  
294  
295  if __name__ == "__main__":
296      app.run()