Cradicle Explorer

/ .github / workflows / gemini-pr-review.py
gemini-pr-review.py
  1  import json
  2  import os
  3  import sys
  4  import time
  5  import random
  6  import datetime
  7  import urllib.parse
  8  import traceback
  9  from pathlib import Path
 10  from typing import List, Dict, Any, Optional, Tuple, Union, Iterable
 11  import google.generativeai as Client
 12  from github import Github, GithubException
 13  import requests
 14  import fnmatch
 15  import re
 16  from unidiff import Hunk, PatchedFile, PatchSet
 17  from unidiff.patch import Line
 18  
 19  # Initialize clients
 20  def initialize_clients():
 21      try:
 22          if os.environ.get("GEMINI_TEST_MODE") == "1":
 23              print("Test mode: Skipping GitHub and Gemini client initialization")
 24              return None, None
 25  
 26          github_token = os.environ.get("GITHUB_TOKEN")
 27          if not github_token:
 28              print("Error: GITHUB_TOKEN environment variable is required.")
 29              sys.exit(1)
 30          gh_client = Github(github_token)
 31  
 32          gemini_api_key = os.environ.get("GEMINI_API_KEY")
 33          if not gemini_api_key:
 34              print("Error: GEMINI_API_KEY environment variable is required.")
 35              sys.exit(1)
 36          Client.configure(api_key=gemini_api_key)
 37          gemini_client_module = Client # Use the configured module
 38  
 39          return gh_client, gemini_client_module
 40      except Exception as e:
 41          print(f"Error during client initialization: {e}")
 42          traceback.print_exc()
 43          sys.exit(1)
 44  
 45  gh, gemini_client_module = initialize_clients()
 46  
 47  
 48  class PRDetails:
 49      def __init__(self, owner: str, repo_name_str: str, pull_number: int, title: str, description: str, repo_obj=None, pr_obj=None, event_type: str = None):
 50          self.owner = owner
 51          self.repo_name = repo_name_str
 52          self.pull_number = pull_number
 53          self.title = title
 54          self.description = description
 55          self.repo_obj = repo_obj
 56          self.pr_obj = pr_obj
 57          self.event_type = event_type
 58  
 59      def get_full_repo_name(self):
 60          return f"{self.owner}/{self.repo_name}"
 61  
 62  
 63  def get_pr_details() -> PRDetails:
 64      github_event_path = os.environ.get("GITHUB_EVENT_PATH")
 65      if not github_event_path:
 66          print("Error: GITHUB_EVENT_PATH environment variable not set.")
 67          sys.exit(1)
 68  
 69      with open(github_event_path, "r", encoding="utf-8") as f:
 70          event_data = json.load(f)
 71  
 72      event_name = os.environ.get("GITHUB_EVENT_NAME")
 73      pr_event_type = None
 74  
 75      if event_name == "issue_comment":
 76          if "issue" in event_data and "pull_request" in event_data["issue"]:
 77              pull_number = event_data["issue"]["number"]
 78              repo_full_name = event_data["repository"]["full_name"]
 79              pr_event_type = "comment"
 80          else:
 81              print("Error: issue_comment event not on a pull request.")
 82              sys.exit(1)
 83      elif event_name == "pull_request":
 84          pull_number = event_data["pull_request"]["number"]
 85          repo_full_name = event_data["repository"]["full_name"]
 86          pr_event_type = event_data.get("action")
 87          print(f"Pull request event action: {pr_event_type}")
 88      else:
 89          print(f"Error: Unsupported GITHUB_EVENT_NAME: {event_name}")
 90          sys.exit(1)
 91  
 92      owner, repo_name_str = repo_full_name.split("/")
 93  
 94      try:
 95          repo_obj = gh.get_repo(repo_full_name)
 96          pr_obj = repo_obj.get_pull(pull_number)
 97      except GithubException as e:
 98          print(f"Error accessing GitHub repository or PR: {e}")
 99          sys.exit(1)
100      except Exception as e:
101          print(f"An unexpected error occurred while fetching PR details: {e}")
102          sys.exit(1)
103  
104      return PRDetails(owner, repo_name_str, pull_number, pr_obj.title, pr_obj.body or "", repo_obj, pr_obj, pr_event_type)
105  
106  
107  def get_diff(pr_details: PRDetails, comparison_sha: Optional[str] = None) -> str:
108      repo = pr_details.repo_obj
109      pr = pr_details.pr_obj
110      head_sha = pr.head.sha
111  
112      if comparison_sha:
113          print(f"Getting diff comparing HEAD ({head_sha}) against specified SHA ({comparison_sha})")
114          try:
115              comparison_obj = repo.compare(comparison_sha, head_sha)
116              diff_parts = []
117              for file_diff in comparison_obj.files:
118                  if file_diff.patch:
119                      # Construct a valid diff header format for unidiff
120                      source_file_path_for_header = file_diff.previous_filename if file_diff.status == 'renamed' else file_diff.filename
121                      target_file_path_for_header = file_diff.filename
122                      
123                      diff_header = f"diff --git a/{source_file_path_for_header} b/{target_file_path_for_header}\n"
124                      if file_diff.status == 'added':
125                          diff_header += f"new file mode {getattr(file_diff, 'mode', '100644')}\n"
126                          diff_header += f"index 0000000..{file_diff.sha[:7]}\n"
127                      elif file_diff.status == 'deleted':
128                          diff_header += f"deleted file mode {getattr(file_diff, 'mode', '100644')}\n"
129                          diff_header += f"index {file_diff.sha[:7]}..0000000\n"
130                      elif file_diff.status == 'renamed':
131                          diff_header += f"similarity index {getattr(file_diff, 'similarity_index', '100')}%\n"
132                          diff_header += f"rename from {source_file_path_for_header}\n" # already set as prev_filename
133                          diff_header += f"rename to {target_file_path_for_header}\n"   # already set as filename
134                          if hasattr(file_diff, 'sha'): # If it's a rename with modifications
135                               diff_header += f"index {getattr(file_diff, 'previous_sha', '0000000')[:7]}..{file_diff.sha[:7]}\n"
136                      elif file_diff.status == 'modified':
137                           # For modified files, the index line shows old SHA..new SHA
138                           # PyGithub's file_diff.sha is the new SHA. We need the old one if available,
139                           # or rely on the patch content itself to have it.
140                           # For simplicity, we'll rely on the patch content for modified index line.
141                           pass
142  
143  
144                      patch_content = file_diff.patch
145                      
146                      # Ensure --- and +++ lines are present, this is critical for unidiff
147                      # The patch from GitHub API usually has these, but repo.compare() might be different.
148                      lines = patch_content.splitlines()
149                      final_patch_lines = []
150  
151                      # Check if patch already contains valid ---/+++ for THESE filenames
152                      # This logic can be complex if file_diff.patch is not a standard unidiff snippet
153                      # For repo.compare, file_diff.patch should be a standard diff hunk content.
154                      
155                      # Simplification: Assume file_diff.patch from repo.compare is the core hunk data
156                      # and we need to wrap it correctly for unidiff.
157                      final_patch_lines.append(f"--- a/{source_file_path_for_header}")
158                      final_patch_lines.append(f"+++ b/{target_file_path_for_header}")
159                      final_patch_lines.extend(lines) # Add the actual patch lines (hunks)
160                                          
161                      diff_parts.append(diff_header + "\n".join(final_patch_lines))
162  
163              if diff_parts:
164                  diff_text = "\n".join(diff_parts) # Each element in diff_parts is a full diff for one file
165                  print(f"Retrieved diff (length: {len(diff_text)}) using repo.compare('{comparison_sha}', '{head_sha}')")
166                  return diff_text
167              else:
168                  print(f"No changes found comparing {comparison_sha} to {head_sha}")
169                  return ""
170          except GithubException as e:
171              print(f"Error getting comparison diff (compare {comparison_sha} vs {head_sha}): {e}. Falling back.")
172          except Exception as e:
173              print(f"Unexpected error during repo.compare: {e}. Falling back.")
174              traceback.print_exc()
175  
176  
177      print(f"Falling back to pr.get_diff() for PR #{pr_details.pull_number}")
178      try:
179          diff_text = pr.get_diff() # This is usually well-formatted for unidiff
180          if diff_text:
181              print(f"Retrieved diff (length: {len(diff_text)}) using pr.get_diff()")
182              return diff_text
183          else:
184              print("pr.get_diff() returned no content.")
185              return ""
186      except GithubException as e:
187          print(f"Error getting diff using pr.get_diff(): {e}. Falling back further.")
188      except Exception as e:
189          print(f"Unexpected error during pr.get_diff(): {e}. Falling back further.")
190  
191      print(f"Falling back to direct API request for PR diff for PR #{pr_details.pull_number}")
192      api_url = f"https://api.github.com/repos/{pr_details.get_full_repo_name()}/pulls/{pr_details.pull_number}"
193      headers = {
194          'Authorization': f'token {os.environ["GITHUB_TOKEN"]}',
195          'Accept': 'application/vnd.github.v3.diff'
196      }
197      try:
198          response = requests.get(api_url, headers=headers, timeout=30)
199          response.raise_for_status()
200          diff_text = response.text
201          print(f"Retrieved diff (length: {len(diff_text)}) via direct API call.")
202          return diff_text
203      except requests.exceptions.RequestException as e:
204          print(f"Failed to get diff via direct API call: {e}")
205      except Exception as e:
206          print(f"Unexpected error during direct API call for diff: {e}")
207  
208      print("All methods to retrieve diff failed.")
209      return ""
210  
211  
212  def get_hunk_representation(hunk: Hunk) -> str:
213      return str(hunk)
214  
215  
216  def get_file_content(file_path: str) -> str:
217      full_file_content = ""
218      code_extensions = [
219          ".py", ".js", ".jsx", ".ts", ".tsx", ".html", ".css", ".scss", ".java",
220          ".c", ".cpp", ".h", ".hpp", ".go", ".rs", ".php", ".rb", ".sh", ".bash",
221          ".json", ".yml", ".yaml", ".toml", ".md"
222      ]
223      is_code_file = any(file_path.endswith(ext) for ext in code_extensions)
224  
225      if not is_code_file:
226          print(f"Skipping full file context for non-code or binary-like file: {file_path}")
227          return ""
228  
229      try:
230          p_file_path = Path(file_path)
231          if p_file_path.exists() and p_file_path.is_file():
232              file_stat = p_file_path.stat()
233              max_initial_read_bytes = 300000
234  
235              if file_stat.st_size > max_initial_read_bytes:
236                  print(f"File {file_path} is very large ({file_stat.st_size} bytes). Reading a truncated version for context.")
237                  with open(p_file_path, 'r', encoding='utf-8', errors='ignore') as f:
238                      start_content = f.read(max_initial_read_bytes // 2)
239                  full_file_content = start_content + "\n\n... [content truncated due to very large size] ...\n\n"
240              else:
241                   with open(p_file_path, 'r', encoding='utf-8', errors='ignore') as f:
242                      full_file_content = f.read()
243  
244              max_char_len_for_context = 150000
245              if len(full_file_content) > max_char_len_for_context:
246                  print(f"File content for {file_path} still too long after initial read ({len(full_file_content)} chars), further truncating for Gemini context.")
247                  half_len = max_char_len_for_context // 2
248                  full_file_content = full_file_content[:half_len] + \
249                                      "\n\n... [content context truncated for brevity] ...\n\n" + \
250                                      full_file_content[-half_len:]
251  
252              print(f"Read file content for {file_path} (length: {len(full_file_content)} chars after potential truncation).")
253          else:
254              print(f"File {file_path} does not exist locally or is not a file. Cannot provide full context.")
255      except Exception as e:
256          print(f"Error reading full file content for {file_path}: {e}")
257          traceback.print_exc()
258      return full_file_content
259  
260  
261  def create_batch_prompt(patched_file: PatchedFile, pr_details: PRDetails) -> str:
262      full_file_content_for_context = get_file_content(patched_file.path)
263  
264      combined_hunks_text = ""
265      for i, hunk in enumerate(patched_file):
266          hunk_text = get_hunk_representation(hunk)
267          if not hunk_text.strip():
268              continue
269  
270          separator = ("-" * 20) + f" Hunk {i+1} (0-indexed: {i}) " + ("-" * 20) + "\n"
271          combined_hunks_text += ("\n\n" if i > 0 else "") + separator + hunk_text
272  
273      instructions = """Your task is reviewing pull requests. Instructions:
274  - Provide the response in the following JSON format: {"reviews": [{"hunkIndex": <hunk_index_0_based>, "lineNumber": <line_number_in_hunk_content_1_based>, "reviewComment": "<review_comment_using_github_markdown>", "confidence": "<High|Medium|Low>"}]}
275  - `hunkIndex` is 0-based, referring to which hunk in the *provided diff below* the comment applies to (matches the 'Hunk X (0-indexed: Y)' header).
276  - `lineNumber` is 1-based, relative to the *content lines* within that specific hunk (i.e., line 1 is the first line *after* the '@@ ... @@' header of that hunk). These are the lines starting with '+', '-', or space.
277  - `confidence` indicates your certainty and the potential impact: "High" (likely critical issue), "Medium" (potential issue/best practice), "Low" (minor suggestion/nitpick).
278  - Provide comments if there is something genuinely to improve or discuss. If no issues, "reviews" should be an empty array. Consider the severity of the issue when deciding to comment.
279  - Use GitHub Markdown for `reviewComment`.
280  - Focus on: bugs, security vulnerabilities, performance bottlenecks, unclear logic, anti-patterns, and violations of SOLID principles or other key design patterns. High-impact issues are preferred.
281  - Make comments actionable. Suggest improvements or ask clarifying questions.
282  - DO NOT suggest adding comments to the code itself (e.g., "add a comment here explaining X").
283  - NOTE: Basic formatting/linting is handled by Biome. Focus on substantive issues. Do not comment on minor style issues. You are reviewing the *final* auto-formatted/linted code.
284  - Carefully analyze the full file context (if provided) and PR context before making suggestions to avoid hallucinations or irrelevant points.
285  - Only suggest changes relevant to the diff. Do not comment on unrelated code unless directly impacted by the changes in the diff.
286  - Be concise and clear.
287  """
288  
289      pr_context = f"\nPull Request Title: {pr_details.title}\nPull Request Description:\n---\n{pr_details.description or 'No description provided.'}\n---\n"
290  
291      file_context_header = ""
292      file_content_block = ""
293      if full_file_content_for_context:
294          file_context_header = "\nFull content of the file for better context (it may be truncated if too large):\n"
295          file_ext = Path(patched_file.path).suffix[1:]
296          file_content_block = f"```{file_ext or 'text'}\n{full_file_content_for_context}\n```\n"
297  
298      diff_to_review_header = f"\nReview the following code diffs for the file \"{patched_file.path}\" ({len(list(patched_file))} hunks):\n"
299      diff_block = f"```diff\n{combined_hunks_text}\n```"
300  
301      return instructions + pr_context + file_context_header + file_content_block + diff_to_review_header + diff_block
302  
303  
304  LAST_GEMINI_REQUEST_TIME = 0
305  GEMINI_RPM_LIMIT = 45
306  GEMINI_REQUEST_INTERVAL_SECONDS = 60.0 / GEMINI_RPM_LIMIT
307  
308  def enforce_gemini_rate_limits():
309      global LAST_GEMINI_REQUEST_TIME
310      current_time = time.time()
311      time_since_last = current_time - LAST_GEMINI_REQUEST_TIME
312      if time_since_last < GEMINI_REQUEST_INTERVAL_SECONDS:
313          wait_time = GEMINI_REQUEST_INTERVAL_SECONDS - time_since_last
314          print(f"Gemini Rate Limiter: Waiting {wait_time:.2f} seconds.")
315          time.sleep(wait_time)
316      LAST_GEMINI_REQUEST_TIME = time.time()
317  
318  
319  def get_ai_response_with_retry(prompt: str, max_retries: int = 3) -> List[Dict[str, Any]]:
320      model_name = os.environ.get('GEMINI_MODEL', 'gemini-1.5-flash-latest')
321  
322      if not gemini_client_module:
323          print("Error: Gemini client module not initialized. Cannot make API call.")
324          return []
325  
326      try:
327          gemini_model = gemini_client_module.GenerativeModel(model_name)
328      except Exception as e:
329          print(f"Error creating GenerativeModel instance with {model_name}: {e}")
330          return []
331  
332      generation_config = {
333          "max_output_tokens": 8192,
334          "temperature": 0.5, # Increased slightly from 0.4
335          "top_p": 0.95,
336      }
337  
338      safety_settings = [
339          {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
340          {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
341          {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
342          {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
343      ]
344  
345      # Log the prompt before sending
346      # For very long prompts, log only a summary or start/end
347      prompt_log_max_len = 2000 
348      if len(prompt) > prompt_log_max_len:
349          print(f"Full prompt (length {len(prompt)}). Start:\n{prompt[:prompt_log_max_len//2]}...\n...End:\n{prompt[-(prompt_log_max_len//2):]}")
350      else:
351          print(f"Full prompt:\n{prompt}")
352  
353  
354      for attempt in range(1, max_retries + 1):
355          try:
356              enforce_gemini_rate_limits()
357              print(f"Attempt {attempt}/{max_retries} - Sending prompt to Gemini model {model_name}...")
358  
359              response = gemini_model.generate_content(
360                  prompt,
361                  generation_config=generation_config,
362                  safety_settings=safety_settings
363              )
364  
365              if not response.parts:
366                  print(f"Warning: AI response (attempt {attempt}) was empty or blocked. Prompt safety ratings: {response.prompt_feedback if hasattr(response, 'prompt_feedback') else 'N/A'}")
367                  if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
368                      print(f"Prompt blocked due to: {response.prompt_feedback.block_reason_message}")
369                  if attempt < max_retries:
370                      time.sleep( (2 ** attempt) * 2 )
371                      continue
372                  return []
373  
374  
375              response_text = response.text.strip()
376              if response_text.startswith("```json"):
377                  response_text = response_text[len("```json"):]
378              if response_text.endswith("```"):
379                  response_text = response_text[:-len("```")]
380              response_text = response_text.strip()
381  
382              # Log the raw response text before parsing, for debugging "no suggestions"
383              print(f"AI Response Text (attempt {attempt}, cleaned for JSON parsing):\n{response_text}")
384  
385              data = json.loads(response_text)
386  
387              if not isinstance(data, dict) or "reviews" not in data or not isinstance(data["reviews"], list):
388                  print(f"Error: AI response has invalid structure. Expected {{'reviews': [...]}}. Got: {type(data)}")
389                  if attempt < max_retries: time.sleep( (2 ** attempt) ); continue
390                  else: return []
391  
392              valid_reviews = []
393              for i, review_item in enumerate(data["reviews"]):
394                  if not isinstance(review_item, dict):
395                      print(f"Error: Review item {i} is not a dict: {review_item}")
396                      continue
397                  required_keys = ["hunkIndex", "lineNumber", "reviewComment", "confidence"]
398                  if not all(k in review_item for k in required_keys):
399                      print(f"Error: Review item {i} missing one or more required keys ({', '.join(required_keys)}): {review_item}")
400                      continue
401                  try:
402                      review_item["hunkIndex"] = int(review_item["hunkIndex"])
403                      review_item["lineNumber"] = int(review_item["lineNumber"])
404                  except ValueError:
405                      print(f"Error: Review item {i} hunkIndex or lineNumber not an int: {review_item}")
406                      continue
407                  if review_item["confidence"] not in ["High", "Medium", "Low"]:
408                      print(f"Warning: Review item {i} has invalid confidence '{review_item.get('confidence')}'. Defaulting to Low.")
409                      review_item["confidence"] = "Low"
410  
411                  valid_reviews.append(review_item)
412  
413              return valid_reviews
414  
415          except json.JSONDecodeError as e:
416              print(f"Error decoding JSON from AI response (attempt {attempt}): {e}")
417              response_text_for_log = "N/A"
418              # 'response_text' is already defined from the try block
419              if 'response_text' in locals() : response_text_for_log = response_text
420              elif 'response' in locals() and hasattr(response, 'text'): response_text_for_log = response.text
421  
422  
423              print(f"Response text that failed parsing (first 500 chars): '{response_text_for_log[:500]}'")
424              if attempt == max_retries: return []
425              time.sleep( (2 ** attempt) )
426          except Exception as e:
427              print(f"Error during Gemini API call (attempt {attempt}): {type(e).__name__} - {e}")
428              if "rate limit" in str(e).lower() or "429" in str(e) or "ResourceExhausted" in type(e).__name__:
429                  delay = (2 ** attempt) + random.uniform(0,1)
430                  print(f"Rate limit likely hit. Retrying in {delay:.2f} seconds...")
431                  time.sleep(delay)
432              elif attempt == max_retries:
433                   print(f"Max retries ({max_retries}) reached. Giving up on this API call.")
434                   return []
435              else:
436                   time.sleep(5 * attempt)
437  
438      return []
439  
440  
441  def analyze_code(files_to_review: Iterable[PatchedFile], pr_details: PRDetails) -> List[Dict[str, Any]]:
442      files_list = list(files_to_review)
443      print(f"Starting code analysis for {len(files_list)} files.")
444      all_comments_for_pr = []
445  
446      for patched_file in files_list:
447          if not patched_file.path or patched_file.path == "/dev/null":
448              print(f"Skipping file with invalid path: {patched_file.path}")
449              continue
450  
451          hunks_in_file = list(patched_file)
452          if not hunks_in_file:
453              print(f"No hunks in file {patched_file.path}, skipping.")
454              continue
455  
456          print(f"\nProcessing file: {patched_file.path} with {len(hunks_in_file)} hunks.")
457  
458          batch_prompt = create_batch_prompt(patched_file, pr_details)
459          ai_reviews_for_file = get_ai_response_with_retry(batch_prompt)
460  
461          if ai_reviews_for_file:
462              print(f"Received {len(ai_reviews_for_file)} review suggestions from AI for file {patched_file.path}.")
463              file_comments = process_batch_ai_reviews(patched_file, ai_reviews_for_file)
464              if file_comments:
465                  all_comments_for_pr.extend(file_comments)
466          else:
467              print(f"No review suggestions from AI for file {patched_file.path}.")
468  
469      print(f"\nFinished analysis. Total comments generated for PR: {len(all_comments_for_pr)}")
470      return all_comments_for_pr
471  
472  
473  def get_hunk_header_str(hunk: Hunk) -> str:
474      # A Hunk's string representation starts with its header: "@@ -old_start,old_len +new_start,new_len @@"
475      # Or constructs it if not directly available.
476      # For logging, it's useful.
477      return f"@@ -{hunk.source_start},{hunk.source_length} +{hunk.target_start},{hunk.target_length} @@"
478  
479  
480  def calculate_github_position(file_patch: PatchedFile, target_hunk_obj: Hunk, relative_line_number_in_hunk_content: int) -> Optional[int]:
481      cumulative_pos_in_diff = 0
482      hunks_in_file = list(file_patch)
483  
484      target_hunk_found = False
485      for current_hunk_obj in hunks_in_file:
486          cumulative_pos_in_diff += 1 
487  
488          if current_hunk_obj == target_hunk_obj:
489              target_hunk_found = True
490              comment_position = cumulative_pos_in_diff + relative_line_number_in_hunk_content -1
491  
492              num_content_lines_in_target_hunk = len(list(target_hunk_obj))
493              if not (1 <= relative_line_number_in_hunk_content <= num_content_lines_in_target_hunk):
494                  target_hunk_header_str = get_hunk_header_str(target_hunk_obj) # Use helper
495                  print(f"Warning: AI suggested line {relative_line_number_in_hunk_content} which is outside the actual "
496                        f"content lines ({num_content_lines_in_target_hunk}) of the target hunk in {file_patch.path}. "
497                        f"Target Hunk Header: {target_hunk_header_str.strip()}. Skipping this comment.")
498                  return None
499              return comment_position
500  
501          cumulative_pos_in_diff += len(list(current_hunk_obj))
502  
503      if not target_hunk_found:
504          target_hunk_header_str = get_hunk_header_str(target_hunk_obj) # Use helper
505          print(f"Error: Target hunk (header: {target_hunk_header_str.strip()}) not found by object comparison in file {file_patch.path} "
506                f"during position calculation.")
507      return None
508  
509  
510  def process_batch_ai_reviews(patched_file: PatchedFile, ai_reviews: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
511      comments_for_github = []
512      hunks_in_file = list(patched_file)
513  
514      for review_detail in ai_reviews:
515          try:
516              hunk_idx_from_ai = review_detail["hunkIndex"]
517              line_num_in_hunk_content = review_detail["lineNumber"]
518              comment_text = review_detail["reviewComment"]
519              confidence = review_detail["confidence"]
520  
521              if not (0 <= hunk_idx_from_ai < len(hunks_in_file)):
522                  print(f"Warning: AI returned out-of-bounds hunkIndex {hunk_idx_from_ai} for file {patched_file.path} "
523                        f"(has {len(hunks_in_file)} hunks). Skipping comment.")
524                  continue
525  
526              target_hunk_object = hunks_in_file[hunk_idx_from_ai]
527  
528              github_pos = calculate_github_position(patched_file, target_hunk_object, line_num_in_hunk_content)
529  
530              if github_pos is not None:
531                  formatted_comment_body = f"**AI Confidence: {confidence}**\n\n{comment_text}"
532  
533                  gh_comment = {
534                      "body": formatted_comment_body,
535                      "path": patched_file.path,
536                      "position": github_pos,
537                      "confidence_raw": confidence
538                  }
539                  comments_for_github.append(gh_comment)
540              else:
541                  print(f"Warning: Could not calculate GitHub position for comment in {patched_file.path}, "
542                        f"Hunk Index {hunk_idx_from_ai}, Line {line_num_in_hunk_content}. Skipping.")
543  
544          except KeyError as e:
545              print(f"Error processing AI review item due to missing key {e}: {review_detail}")
546          except Exception as e:
547              print(f"Unexpected error processing AI review item {review_detail}: {e}")
548              traceback.print_exc()
549  
550      return comments_for_github
551  
552  
553  def save_review_results_to_json(pr_details: PRDetails, comments: List[Dict[str, Any]], filepath_str: str = "reviews/gemini-pr-review.json") -> str:
554      filepath = Path(filepath_str)
555      filepath.parent.mkdir(parents=True, exist_ok=True)
556  
557      review_data = {
558          "metadata": {
559              "pr_number": pr_details.pull_number,
560              "repo": pr_details.get_full_repo_name(),
561              "title": pr_details.title,
562              "timestamp_utc": datetime.datetime.utcnow().isoformat() + "Z",
563              "review_tool": "Gemini AI Reviewer",
564              "model_used": os.environ.get('GEMINI_MODEL', 'N/A')
565          },
566          "review_comments": []
567      }
568  
569      for gh_comment_dict in comments:
570          structured_comment = {
571              "file_path": gh_comment_dict["path"],
572              "github_diff_position": gh_comment_dict["position"],
573              "comment_text_md": gh_comment_dict["body"],
574              "ai_confidence": gh_comment_dict.get("confidence_raw", "N/A"),
575              "detected_severity_heuristic": detect_severity(gh_comment_dict["body"]),
576              "detected_category_heuristic": detect_category(gh_comment_dict["body"])
577          }
578          review_data["review_comments"].append(structured_comment)
579  
580      with open(filepath, "w", encoding="utf-8") as f:
581          json.dump(review_data, f, indent=2)
582  
583      print(f"Review results saved to {filepath}")
584      return str(filepath)
585  
586  
587  def detect_severity(comment_text: str) -> str:
588      lower_text = comment_text.lower()
589      if any(word in lower_text for word in ["critical", "security vulnerability", "crash", "exploit", "must fix", "data loss"]):
590          return "critical"
591      if any(word in lower_text for word in ["bug", "error", "incorrect", "wrong", "security", "potential vulnerability", "flaw"]):
592          return "high"
593      if any(word in lower_text for word in ["performance", "optimization", "memory", "leak", "consider fixing", "confusing", "unclear"]):
594          return "medium"
595      return "low"
596  
597  def detect_category(comment_text: str) -> str:
598      lower_text = comment_text.lower()
599      if any(word in lower_text for word in ["security", "vulnerability", "exploit", "auth", "csrf", "xss", "injection", "password", "secret"]):
600          return "security"
601      if any(word in lower_text for word in ["performance", "slow", "optimization", "efficient", "memory", "cpu", "latency", "resource"]):
602          return "performance"
603      if any(word in lower_text for word in ["bug", "error", "incorrect", "wrong", "fix", "defect", "exception", "nullpointer"]):
604          return "bug"
605      if any(word in lower_text for word in ["style", "format", "naming", "convention", "readability", "clarity", "understandability", "documentation", "commenting"]):
606          return "style/clarity"
607      if any(word in lower_text for word in ["refactor", "clean", "simplify", "maintainability", "design", "architecture", "pattern", "anti-pattern", "duplication"]):
608          return "refactoring/design"
609      if any(word in lower_text for word in ["test", "coverage", "assertion", "mocking"]):
610          return "testing"
611      return "general"
612  
613  
614  def create_review_and_summary_comment(pr_details: PRDetails, comments_for_gh_review: List[Dict[str, Any]], review_json_path: str):
615      if not pr_details.pr_obj:
616          print("Error: PR object not available in PRDetails. Cannot create review or comments.")
617          return
618  
619      pr = pr_details.pr_obj
620      num_suggestions = len(comments_for_gh_review)
621  
622      if num_suggestions > 0:
623          valid_review_comments = []
624          for c in comments_for_gh_review:
625              if all(k in c for k in ["body", "path", "position"]):
626                  if isinstance(c["position"], int) and isinstance(c["path"], str) and isinstance(c["body"], str):
627                      valid_review_comments.append({
628                          "body": c["body"],
629                          "path": c["path"],
630                          "position": c["position"]
631                      })
632                  else:
633                      print(f"Warning: Skipping malformed comment due to type mismatch: {c}")
634              else:
635                  print(f"Warning: Skipping malformed comment due to missing keys: {c}")
636  
637          if valid_review_comments:
638              try:
639                  print(f"Creating a PR review with {len(valid_review_comments)} suggestions.")
640                  pr.create_review(
641                      body="Automated AI code review suggestions:",
642                      event="COMMENT",
643                      comments=valid_review_comments
644                  )
645                  print("Successfully created PR review with suggestions.")
646              except GithubException as e:
647                  print(f"Error creating PR review: {e}. Status: {e.status}, Data: {e.data}")
648                  print("Falling back to posting individual issue comments for suggestions.")
649                  for c_item in valid_review_comments:
650                      try:
651                          pr.create_issue_comment(f"**File:** `{c_item['path']}` (at diff position {c_item['position']})\n\n{c_item['body']}")
652                      except Exception as ie:
653                          print(f"Error posting individual suggestion as issue comment: {ie}")
654              except Exception as e:
655                  print(f"Unexpected error during PR review creation: {e}")
656                  traceback.print_exc()
657          else:
658              print("No validly structured comments to create a review with.")
659      else:
660          print("No suggestions to create a PR review for.")
661  
662      repo_full_name = os.environ.get("GITHUB_REPOSITORY", pr_details.get_full_repo_name())
663      server_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com")
664      branch_name = os.environ.get("GITHUB_HEAD_REF")
665      if not branch_name and hasattr(pr.head, 'ref'):
666          branch_name = pr.head.ref
667  
668      review_file_url_md = f"Review JSON file (`{review_json_path}` in the repository)"
669      if branch_name:
670          try:
671              encoded_branch = urllib.parse.quote_plus(branch_name)
672              review_file_url = f"{server_url}/{repo_full_name}/blob/{encoded_branch}/{review_json_path}"
673              review_file_url_md = f"Full review details in [`{review_json_path}`]({review_file_url})"
674              print(f"Summary comment will link to: {review_file_url}")
675          except Exception as url_e:
676              print(f"Error creating review file URL: {url_e}")
677      else:
678          print("Warning: Could not determine branch name for summary comment URL.")
679  
680  
681      summary_body = f"✨ **Gemini AI Code Review Complete** ✨\n\n"
682      if num_suggestions > 0:
683          summary_body += f"- Found {num_suggestions} potential areas for discussion/improvement (see review comments above or in the review tab).\n"
684      else:
685          summary_body += "- No specific suggestions made by the AI in this pass.\n"
686      summary_body += f"- {review_file_url_md}.\n"
687      summary_body += f"- Model: `{os.environ.get('GEMINI_MODEL', 'N/A')}`\n"
688      summary_body += f"- Reviewed at: {datetime.datetime.utcnow().isoformat()}Z\n"
689  
690      try:
691          pr.create_issue_comment(summary_body)
692          print("Successfully created summary comment on PR.")
693      except GithubException as e:
694          print(f"Error creating summary PR comment: {e}")
695      except Exception as e:
696          print(f"Unexpected error creating summary PR comment: {e}")
697          traceback.print_exc()
698  
699  
700  def parse_diff_to_patchset(diff_text: str) -> Optional[PatchSet]:
701      if not diff_text:
702          print("No diff text to parse.")
703          return None
704      try:
705          patch_set = PatchSet(diff_text)
706          print(f"Diff parsed into PatchSet with {len(list(patch_set))} patched files.")
707          return patch_set
708      except Exception as e:
709          print(f"Error parsing diff string with unidiff: {type(e).__name__} - {e}")
710          print(f"Diff text that failed (first 1000 chars): {diff_text[:1000]}")
711      return None
712  
713  
714  def main():
715      print("Starting AI Code Review Script...")
716      if not gh or not gemini_client_module:
717          print("Error: GitHub or Gemini client not available. Exiting.")
718          sys.exit(1)
719  
720      pr_details = get_pr_details()
721      print(f"Processing PR #{pr_details.pull_number} in repo {pr_details.get_full_repo_name()} (Event: {pr_details.event_type})")
722  
723      last_run_sha_from_env = os.environ.get("LAST_RUN_SHA", "").strip()
724      head_sha = pr_details.pr_obj.head.sha
725      base_sha = pr_details.pr_obj.base.sha
726  
727      comparison_sha_for_diff = None
728      if pr_details.event_type in ["opened", "reopened"]:
729          comparison_sha_for_diff = base_sha
730          print(f"Event type is '{pr_details.event_type}'. Reviewing full PR against base SHA: {comparison_sha_for_diff}")
731      elif pr_details.event_type == "synchronize":
732          if last_run_sha_from_env and last_run_sha_from_env != head_sha :
733              comparison_sha_for_diff = last_run_sha_from_env
734              print(f"Event type is 'synchronize'. Reviewing changes since last run SHA: {comparison_sha_for_diff}")
735          else:
736              comparison_sha_for_diff = base_sha
737              if not last_run_sha_from_env:
738                   print(f"Event type is 'synchronize', but no last_run_sha found. Reviewing full PR against base SHA: {comparison_sha_for_diff}")
739              elif last_run_sha_from_env == head_sha:
740                   print(f"Event type is 'synchronize', but last_run_sha ({last_run_sha_from_env}) is same as head_sha. No new commits for incremental review. Defaulting to full review against base SHA: {comparison_sha_for_diff}.")
741      else:
742          comparison_sha_for_diff = base_sha
743          print(f"Event type is '{pr_details.event_type}'. Defaulting to full review against base SHA: {comparison_sha_for_diff}")
744  
745      if head_sha == comparison_sha_for_diff:
746          print(f"HEAD SHA ({head_sha}) is the same as comparison SHA ({comparison_sha_for_diff}). No new changes to diff.")
747          save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json")
748          create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json")
749          print("Exiting as there are no new changes to review based on SHAs.")
750          return
751  
752      diff_text = get_diff(pr_details, comparison_sha_for_diff)
753      if not diff_text:
754          print("No diff content retrieved. Exiting review process.")
755          save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json")
756          create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json")
757          return
758  
759      initial_patch_set = parse_diff_to_patchset(diff_text)
760      if not initial_patch_set:
761          print("Failed to parse diff into PatchSet. Exiting.")
762          save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json")
763          sys.exit(1)
764  
765      exclude_patterns_str = os.environ.get("INPUT_EXCLUDE", "")
766      exclude_patterns = [p.strip() for p in exclude_patterns_str.split(',') if p.strip()]
767  
768      actual_files_to_process: List[PatchedFile] = []
769      for patched_file_obj in initial_patch_set:
770          normalized_path = patched_file_obj.path.lstrip('./')
771          is_excluded = False
772  
773          if patched_file_obj.is_removed_file or (patched_file_obj.is_added_file and patched_file_obj.target_file == '/dev/null'):
774              print(f"Skipping removed file (or added as /dev/null): {patched_file_obj.path}")
775              is_excluded = True
776          elif patched_file_obj.is_binary_file:
777              print(f"Excluding binary file: {patched_file_obj.path}")
778              is_excluded = True
779          else:
780              for pattern in exclude_patterns:
781                  if fnmatch.fnmatch(normalized_path, pattern) or fnmatch.fnmatch(patched_file_obj.path, pattern):
782                      print(f"Excluding file '{patched_file_obj.path}' due to pattern '{pattern}'.")
783                      is_excluded = True
784                      break
785          if not is_excluded:
786              actual_files_to_process.append(patched_file_obj)
787  
788      num_files_to_analyze = len(actual_files_to_process)
789      print(f"Number of files to analyze after exclusions: {num_files_to_analyze}")
790  
791      if num_files_to_analyze == 0:
792          print("No files to analyze after applying exclusion patterns.")
793          save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json")
794          create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json")
795          return
796  
797      comments_for_gh_review_api = analyze_code(actual_files_to_process, pr_details)
798  
799      review_json_filepath = "reviews/gemini-pr-review.json"
800      save_review_results_to_json(pr_details, comments_for_gh_review_api, review_json_filepath)
801      create_review_and_summary_comment(pr_details, comments_for_gh_review_api, review_json_filepath)
802  
803      print("AI Code Review Script finished.")
804  
805  
806  if __name__ == "__main__":
807      try:
808          main()
809      except SystemExit:
810          raise
811      except Exception as e:
812          print(f"Unhandled exception in __main__: {type(e).__name__} - {e}")
813          traceback.print_exc()
814          sys.exit(1)