/ reference / dinoxor.s
dinoxor.s
  1  .global _dinoxor               // Make the _dinoxor symbol visible to the linker
  2  .text                          // Assemble everything below into the code section
  3  
  4  // Function: dinoxor
  5  // Description: Computes the bitwise XOR of two bytes through a NEON table lookup.
  6  //              Each operand is expanded to one byte per bit in v2, every bit pair is turned
  7  //              into an index (2*a + b) into a 4-entry XOR truth table, and the looked-up
  8  //              bits are recombined into a single byte.
  9  // Arguments:
 10  //   - x0: First operand byte (only bits 0-7 are read)
 11  //   - x1: Second operand byte (only bits 0-7 are read)
 12  // Returns: The result of the XOR operation between the two input bytes in w0.
 13  _dinoxor:
 14      // Prologue: this function calls helpers with bl, which overwrites x30, so save x29/x30
 15      stp x29, x30, [sp, #-16]!  // Push the frame pointer and return address
 16      mov x29, sp                // Establish this function's frame
 17
 18      // spread_bits_to_bytes shifts bits into v0, so v0 has to start out as all zeros
 19      eor v0.16b, v0.16b, v0.16b // v0 = 0
 20
 21      bl spread_bits_to_bytes    // Spread the first operand (x0); for now its bits sit in the upper half of v2
 22      mov x0, x1                 // The second operand becomes the argument of the next call
 23      bl spread_bits_to_bytes    // Spread the second operand; the first operand's bits move to the lower half
 24      // After the above operations
 25      // For the inputs:
 26      //   x0 = 0b10101010
 27      //   x1 = 0b11111111
 28      // v2 contains (lane 0 first; each operand is stored least significant bit first):
 29      // v2 = {0x00 0x01 0x00 0x01 0x00 0x01 0x00 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01}
 30
 31      bl prepare_xor_truth_table // Load the XOR truth table into v0 (the accumulator is no longer needed)
 32      // After the above operation, v0 contains:
 33      // v0 = {0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00}
 34
 35      bl prepare_multiplication_table // Load the per-lane weights used to build the table indices into v1
 36      // After the above operation, v1 contains:
 37      // v1 = {0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01}
 38
 39      bl calculate_xor_result    // Combine v0, v1 and v2 into the XOR'd byte, returned in w0
 40
 41      // Epilogue: restore the saved registers and release the 16-byte frame
 42      ldp x29, x30, [sp], #16    // Pop the frame pointer and return address
 43
 44      ret                        // Return to the caller with the result in w0
 45  
 46  // Function: spread_bits_to_bytes
 47  // Description: Expands the low 8 bits of x0 into eight bytes (0x00 or 0x01 each) and pushes
 48  //              them into the 16-byte accumulator v0; v2 receives the aligned view of v0.
 49  // Arguments:
 50  //   - x0: The input byte to be spread (bits 8 and above are ignored)
 51  //   - v0: Accumulator; must be all zeros before the first call and untouched between calls
 52  // Returns: None. v2 lanes 8-15 = bits 0..7 of x0 (bit 0 first); v2 lanes 0-7 = the bytes that
 53  //          were in v2 lanes 8-15 after the previous call. Clobbers w2, w3, v0 and the flags.
 54  spread_bits_to_bytes:
 55      mov w2, #0                      // w2 = index of the bit being extracted (0-7)
 56
 57  spread_bit_loop:
 58      lsr w3, w0, w2                  // Bring bit w2 of the input down to bit 0
 59      and w3, w3, #0x01               // w3 = that bit as 0 or 1
 60
 61      // Rotate the accumulator one byte towards lane 0 (lane i takes lane i+1, lane 15 takes
 62      // lane 0), then overwrite lane 0 with the new bit. EXT reads both sources before it
 63      // writes its destination, so rotating v0 in place is safe.
 64      ext v0.16b, v0.16b, v0.16b, #1
 65      ins v0.b[0], w3
 66
 67      add w2, w2, #1                  // Advance to the next bit
 68      cmp w2, #8                      // 8 bits per byte
 69      b.lo spread_bit_loop            // Unsigned compare: loop while w2 < 8
 70
 71      // One more rotation moves the last inserted bit from lane 0 to lane 15, which leaves this
 72      // call's bits in lanes 8-15 and the previous call's bits in lanes 0-7 of v2.
 73      ext v2.16b, v0.16b, v0.16b, #1
 74      ret
 75  
 76  // Function: prepare_xor_truth_table
 77  // Description: Fills v0 with the 4-entry XOR truth table {0, 1, 1, 0}, repeated four times,
 78  //              so that the byte at index (2*a + b) holds a XOR b for single-bit a and b.
 79  // Arguments: None
 80  // Returns: None (truth table in v0; w8 is left holding 0x00010100)
 81  prepare_xor_truth_table:
 82      // Build the word 0x00010100, whose bytes from lowest to highest are 00 01 01 00
 83      mov w8, #0x0100                 // Low halfword: table entries 0 and 1
 84      movk w8, #0x0001, lsl #16       // High halfword: table entries 2 and 3
 85
 86      dup v0.4s, w8                   // Copy the word into all four 32-bit lanes of v0
 87      // v0 = {0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00 0x00 0x01 0x01 0x00}
 88
 89      ret
 90  
 91  // Function: prepare_multiplication_table
 92  // Description: Sets up the per-lane weights in v1: 0x02 for lanes 0-7 and 0x01 for lanes 8-15.
 93  // Arguments: None
 94  // Returns: None (the multiplication table is stored in v1). Clobbers v16.
 95  prepare_multiplication_table:
 96      // v16 is the scratch register because the low halves of v8-v15 are callee-saved (AAPCS64)
 97      movi v1.8b, #0x02   // Lower half of v1 = 0x02 (the 8-byte form also zeroes the upper half)
 98      movi v16.8b, #0x01  // Lower half of v16 = 0x01
 99
100      mov v1.d[1], v16.d[0]  // Copy the 0x01 bytes into the upper half of v1
101      // After the above operations, v1 contains:
102      // v1 = {0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x02 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01}
103
104      ret
105  
106  // Function: calculate_xor_result
107  // Description: Calculates the XOR result using the prepared data in NEON registers.
108  //              For every bit position i it forms the index 2*A[i] + B[i], looks that index up
109  //              in the XOR truth table, weights the looked-up bit by (1 << i), and sums the
110  //              eight weighted bits into the result byte.
111  // Arguments:
112  //   - v0: The XOR truth table (index 0 -> 0, 1 -> 1, 2 -> 1, 3 -> 0)
113  //   - v1: The multiplication table (0x02 in lanes 0-7, 0x01 in lanes 8-15)
114  //   - v2: The spread bits: first operand A in lanes 0-7, second operand B in lanes 8-15
115  // Returns: The XOR result of the input operands in w0.
116  // Clobbers: x1, v0, v1, v3
117  calculate_xor_result:
118      mul v3.16b, v2.16b, v1.16b      // v3 = {2*A[0..7], B[0..7]} (B is multiplied by 1, so unchanged)
119      ext v1.16b, v3.16b, v3.16b, #8  // v1 = v3 with its halves swapped = {B[0..7], 2*A[0..7]}
120      add v1.8b, v3.8b, v1.8b         // v1[i] = 2*A[i] + B[i], an index in 0-3; upper half is zeroed
121      tbl v1.8b, {v0.16b}, v1.8b      // v1[i] = truth_table[v1[i]] = A[i] XOR B[i]
122
123      // The truth table is no longer needed, so the lower half of v0 is reused for the bit
124      // weights 0x01, 0x02, 0x04, ..., 0x80 (lane i holds 1 << i).
125      movz x1, #0x0201                // Weights for bits 0 and 1
126      movk x1, #0x0804, lsl #16       // Weights for bits 2 and 3
127      movk x1, #0x2010, lsl #32       // Weights for bits 4 and 5
128      movk x1, #0x8040, lsl #48       // Weights for bits 6 and 7
129      mov v0.d[0], x1                 // v0 lanes 0-7 = 01 02 04 08 10 20 40 80
130
131      // Only lanes 0-7 carry data, so the 8-byte forms are sufficient from here on.
132      mul v1.8b, v1.8b, v0.8b         // v1[i] = (A[i] XOR B[i]) << i
133      addv b0, v1.8b                  // b0 = sum of the eight weighted bits = the result byte
134      umov w0, v0.b[0]                // Zero-extend the result byte into w0 (return value)
135
136      // Each weighted lane has at most one distinct bit set, so the sum cannot carry between
137      ret                             // bits and equals the bitwise OR of the lanes.