gemini-pr-review.py
1 import json 2 import os 3 import sys 4 import time 5 import random 6 import datetime 7 import urllib.parse 8 import traceback 9 from pathlib import Path 10 from typing import List, Dict, Any, Optional, Tuple, Union, Iterable 11 import google.generativeai as Client 12 from github import Github, GithubException 13 import requests 14 import fnmatch 15 import re 16 from unidiff import Hunk, PatchedFile, PatchSet 17 from unidiff.patch import Line 18 19 # Initialize clients 20 def initialize_clients(): 21 try: 22 if os.environ.get("GEMINI_TEST_MODE") == "1": 23 print("Test mode: Skipping GitHub and Gemini client initialization") 24 return None, None 25 26 github_token = os.environ.get("GITHUB_TOKEN") 27 if not github_token: 28 print("Error: GITHUB_TOKEN environment variable is required.") 29 sys.exit(1) 30 gh_client = Github(github_token) 31 32 gemini_api_key = os.environ.get("GEMINI_API_KEY") 33 if not gemini_api_key: 34 print("Error: GEMINI_API_KEY environment variable is required.") 35 sys.exit(1) 36 Client.configure(api_key=gemini_api_key) 37 gemini_client_module = Client # Use the configured module 38 39 return gh_client, gemini_client_module 40 except Exception as e: 41 print(f"Error during client initialization: {e}") 42 traceback.print_exc() 43 sys.exit(1) 44 45 gh, gemini_client_module = initialize_clients() 46 47 48 class PRDetails: 49 def __init__(self, owner: str, repo_name_str: str, pull_number: int, title: str, description: str, repo_obj=None, pr_obj=None, event_type: str = None): 50 self.owner = owner 51 self.repo_name = repo_name_str 52 self.pull_number = pull_number 53 self.title = title 54 self.description = description 55 self.repo_obj = repo_obj 56 self.pr_obj = pr_obj 57 self.event_type = event_type 58 59 def get_full_repo_name(self): 60 return f"{self.owner}/{self.repo_name}" 61 62 63 def get_pr_details() -> PRDetails: 64 github_event_path = os.environ.get("GITHUB_EVENT_PATH") 65 if not github_event_path: 66 print("Error: GITHUB_EVENT_PATH environment variable not set.") 67 sys.exit(1) 68 69 with open(github_event_path, "r", encoding="utf-8") as f: 70 event_data = json.load(f) 71 72 event_name = os.environ.get("GITHUB_EVENT_NAME") 73 pr_event_type = None 74 75 if event_name == "issue_comment": 76 if "issue" in event_data and "pull_request" in event_data["issue"]: 77 pull_number = event_data["issue"]["number"] 78 repo_full_name = event_data["repository"]["full_name"] 79 pr_event_type = "comment" 80 else: 81 print("Error: issue_comment event not on a pull request.") 82 sys.exit(1) 83 elif event_name == "pull_request": 84 pull_number = event_data["pull_request"]["number"] 85 repo_full_name = event_data["repository"]["full_name"] 86 pr_event_type = event_data.get("action") 87 print(f"Pull request event action: {pr_event_type}") 88 else: 89 print(f"Error: Unsupported GITHUB_EVENT_NAME: {event_name}") 90 sys.exit(1) 91 92 owner, repo_name_str = repo_full_name.split("/") 93 94 try: 95 repo_obj = gh.get_repo(repo_full_name) 96 pr_obj = repo_obj.get_pull(pull_number) 97 except GithubException as e: 98 print(f"Error accessing GitHub repository or PR: {e}") 99 sys.exit(1) 100 except Exception as e: 101 print(f"An unexpected error occurred while fetching PR details: {e}") 102 sys.exit(1) 103 104 return PRDetails(owner, repo_name_str, pull_number, pr_obj.title, pr_obj.body or "", repo_obj, pr_obj, pr_event_type) 105 106 107 def get_diff(pr_details: PRDetails, comparison_sha: Optional[str] = None) -> str: 108 repo = pr_details.repo_obj 109 pr = pr_details.pr_obj 110 head_sha = pr.head.sha 111 112 if comparison_sha: 113 print(f"Getting diff comparing HEAD ({head_sha}) against specified SHA ({comparison_sha})") 114 try: 115 comparison_obj = repo.compare(comparison_sha, head_sha) 116 diff_parts = [] 117 for file_diff in comparison_obj.files: 118 if file_diff.patch: 119 # Construct a valid diff header format for unidiff 120 source_file_path_for_header = file_diff.previous_filename if file_diff.status == 'renamed' else file_diff.filename 121 target_file_path_for_header = file_diff.filename 122 123 diff_header = f"diff --git a/{source_file_path_for_header} b/{target_file_path_for_header}\n" 124 if file_diff.status == 'added': 125 diff_header += f"new file mode {getattr(file_diff, 'mode', '100644')}\n" 126 diff_header += f"index 0000000..{file_diff.sha[:7]}\n" 127 elif file_diff.status == 'deleted': 128 diff_header += f"deleted file mode {getattr(file_diff, 'mode', '100644')}\n" 129 diff_header += f"index {file_diff.sha[:7]}..0000000\n" 130 elif file_diff.status == 'renamed': 131 diff_header += f"similarity index {getattr(file_diff, 'similarity_index', '100')}%\n" 132 diff_header += f"rename from {source_file_path_for_header}\n" # already set as prev_filename 133 diff_header += f"rename to {target_file_path_for_header}\n" # already set as filename 134 if hasattr(file_diff, 'sha'): # If it's a rename with modifications 135 diff_header += f"index {getattr(file_diff, 'previous_sha', '0000000')[:7]}..{file_diff.sha[:7]}\n" 136 elif file_diff.status == 'modified': 137 # For modified files, the index line shows old SHA..new SHA 138 # PyGithub's file_diff.sha is the new SHA. We need the old one if available, 139 # or rely on the patch content itself to have it. 140 # For simplicity, we'll rely on the patch content for modified index line. 141 pass 142 143 144 patch_content = file_diff.patch 145 146 # Ensure --- and +++ lines are present, this is critical for unidiff 147 # The patch from GitHub API usually has these, but repo.compare() might be different. 148 lines = patch_content.splitlines() 149 final_patch_lines = [] 150 151 # Check if patch already contains valid ---/+++ for THESE filenames 152 # This logic can be complex if file_diff.patch is not a standard unidiff snippet 153 # For repo.compare, file_diff.patch should be a standard diff hunk content. 154 155 # Simplification: Assume file_diff.patch from repo.compare is the core hunk data 156 # and we need to wrap it correctly for unidiff. 157 final_patch_lines.append(f"--- a/{source_file_path_for_header}") 158 final_patch_lines.append(f"+++ b/{target_file_path_for_header}") 159 final_patch_lines.extend(lines) # Add the actual patch lines (hunks) 160 161 diff_parts.append(diff_header + "\n".join(final_patch_lines)) 162 163 if diff_parts: 164 diff_text = "\n".join(diff_parts) # Each element in diff_parts is a full diff for one file 165 print(f"Retrieved diff (length: {len(diff_text)}) using repo.compare('{comparison_sha}', '{head_sha}')") 166 return diff_text 167 else: 168 print(f"No changes found comparing {comparison_sha} to {head_sha}") 169 return "" 170 except GithubException as e: 171 print(f"Error getting comparison diff (compare {comparison_sha} vs {head_sha}): {e}. Falling back.") 172 except Exception as e: 173 print(f"Unexpected error during repo.compare: {e}. Falling back.") 174 traceback.print_exc() 175 176 177 print(f"Falling back to pr.get_diff() for PR #{pr_details.pull_number}") 178 try: 179 diff_text = pr.get_diff() # This is usually well-formatted for unidiff 180 if diff_text: 181 print(f"Retrieved diff (length: {len(diff_text)}) using pr.get_diff()") 182 return diff_text 183 else: 184 print("pr.get_diff() returned no content.") 185 return "" 186 except GithubException as e: 187 print(f"Error getting diff using pr.get_diff(): {e}. Falling back further.") 188 except Exception as e: 189 print(f"Unexpected error during pr.get_diff(): {e}. Falling back further.") 190 191 print(f"Falling back to direct API request for PR diff for PR #{pr_details.pull_number}") 192 api_url = f"https://api.github.com/repos/{pr_details.get_full_repo_name()}/pulls/{pr_details.pull_number}" 193 headers = { 194 'Authorization': f'token {os.environ["GITHUB_TOKEN"]}', 195 'Accept': 'application/vnd.github.v3.diff' 196 } 197 try: 198 response = requests.get(api_url, headers=headers, timeout=30) 199 response.raise_for_status() 200 diff_text = response.text 201 print(f"Retrieved diff (length: {len(diff_text)}) via direct API call.") 202 return diff_text 203 except requests.exceptions.RequestException as e: 204 print(f"Failed to get diff via direct API call: {e}") 205 except Exception as e: 206 print(f"Unexpected error during direct API call for diff: {e}") 207 208 print("All methods to retrieve diff failed.") 209 return "" 210 211 212 def get_hunk_representation(hunk: Hunk) -> str: 213 return str(hunk) 214 215 216 def get_file_content(file_path: str) -> str: 217 full_file_content = "" 218 code_extensions = [ 219 ".py", ".js", ".jsx", ".ts", ".tsx", ".html", ".css", ".scss", ".java", 220 ".c", ".cpp", ".h", ".hpp", ".go", ".rs", ".php", ".rb", ".sh", ".bash", 221 ".json", ".yml", ".yaml", ".toml", ".md" 222 ] 223 is_code_file = any(file_path.endswith(ext) for ext in code_extensions) 224 225 if not is_code_file: 226 print(f"Skipping full file context for non-code or binary-like file: {file_path}") 227 return "" 228 229 try: 230 p_file_path = Path(file_path) 231 if p_file_path.exists() and p_file_path.is_file(): 232 file_stat = p_file_path.stat() 233 max_initial_read_bytes = 300000 234 235 if file_stat.st_size > max_initial_read_bytes: 236 print(f"File {file_path} is very large ({file_stat.st_size} bytes). Reading a truncated version for context.") 237 with open(p_file_path, 'r', encoding='utf-8', errors='ignore') as f: 238 start_content = f.read(max_initial_read_bytes // 2) 239 full_file_content = start_content + "\n\n... [content truncated due to very large size] ...\n\n" 240 else: 241 with open(p_file_path, 'r', encoding='utf-8', errors='ignore') as f: 242 full_file_content = f.read() 243 244 max_char_len_for_context = 150000 245 if len(full_file_content) > max_char_len_for_context: 246 print(f"File content for {file_path} still too long after initial read ({len(full_file_content)} chars), further truncating for Gemini context.") 247 half_len = max_char_len_for_context // 2 248 full_file_content = full_file_content[:half_len] + \ 249 "\n\n... [content context truncated for brevity] ...\n\n" + \ 250 full_file_content[-half_len:] 251 252 print(f"Read file content for {file_path} (length: {len(full_file_content)} chars after potential truncation).") 253 else: 254 print(f"File {file_path} does not exist locally or is not a file. Cannot provide full context.") 255 except Exception as e: 256 print(f"Error reading full file content for {file_path}: {e}") 257 traceback.print_exc() 258 return full_file_content 259 260 261 def create_batch_prompt(patched_file: PatchedFile, pr_details: PRDetails) -> str: 262 full_file_content_for_context = get_file_content(patched_file.path) 263 264 combined_hunks_text = "" 265 for i, hunk in enumerate(patched_file): 266 hunk_text = get_hunk_representation(hunk) 267 if not hunk_text.strip(): 268 continue 269 270 separator = ("-" * 20) + f" Hunk {i+1} (0-indexed: {i}) " + ("-" * 20) + "\n" 271 combined_hunks_text += ("\n\n" if i > 0 else "") + separator + hunk_text 272 273 instructions = """Your task is reviewing pull requests. Instructions: 274 - Provide the response in the following JSON format: {"reviews": [{"hunkIndex": <hunk_index_0_based>, "lineNumber": <line_number_in_hunk_content_1_based>, "reviewComment": "<review_comment_using_github_markdown>", "confidence": "<High|Medium|Low>"}]} 275 - `hunkIndex` is 0-based, referring to which hunk in the *provided diff below* the comment applies to (matches the 'Hunk X (0-indexed: Y)' header). 276 - `lineNumber` is 1-based, relative to the *content lines* within that specific hunk (i.e., line 1 is the first line *after* the '@@ ... @@' header of that hunk). These are the lines starting with '+', '-', or space. 277 - `confidence` indicates your certainty and the potential impact: "High" (likely critical issue), "Medium" (potential issue/best practice), "Low" (minor suggestion/nitpick). 278 - Provide comments if there is something genuinely to improve or discuss. If no issues, "reviews" should be an empty array. Consider the severity of the issue when deciding to comment. 279 - Use GitHub Markdown for `reviewComment`. 280 - Focus on: bugs, security vulnerabilities, performance bottlenecks, unclear logic, anti-patterns, and violations of SOLID principles or other key design patterns. High-impact issues are preferred. 281 - Make comments actionable. Suggest improvements or ask clarifying questions. 282 - DO NOT suggest adding comments to the code itself (e.g., "add a comment here explaining X"). 283 - NOTE: Basic formatting/linting is handled by Biome. Focus on substantive issues. Do not comment on minor style issues. You are reviewing the *final* auto-formatted/linted code. 284 - Carefully analyze the full file context (if provided) and PR context before making suggestions to avoid hallucinations or irrelevant points. 285 - Only suggest changes relevant to the diff. Do not comment on unrelated code unless directly impacted by the changes in the diff. 286 - Be concise and clear. 287 """ 288 289 pr_context = f"\nPull Request Title: {pr_details.title}\nPull Request Description:\n---\n{pr_details.description or 'No description provided.'}\n---\n" 290 291 file_context_header = "" 292 file_content_block = "" 293 if full_file_content_for_context: 294 file_context_header = "\nFull content of the file for better context (it may be truncated if too large):\n" 295 file_ext = Path(patched_file.path).suffix[1:] 296 file_content_block = f"```{file_ext or 'text'}\n{full_file_content_for_context}\n```\n" 297 298 diff_to_review_header = f"\nReview the following code diffs for the file \"{patched_file.path}\" ({len(list(patched_file))} hunks):\n" 299 diff_block = f"```diff\n{combined_hunks_text}\n```" 300 301 return instructions + pr_context + file_context_header + file_content_block + diff_to_review_header + diff_block 302 303 304 LAST_GEMINI_REQUEST_TIME = 0 305 GEMINI_RPM_LIMIT = 45 306 GEMINI_REQUEST_INTERVAL_SECONDS = 60.0 / GEMINI_RPM_LIMIT 307 308 def enforce_gemini_rate_limits(): 309 global LAST_GEMINI_REQUEST_TIME 310 current_time = time.time() 311 time_since_last = current_time - LAST_GEMINI_REQUEST_TIME 312 if time_since_last < GEMINI_REQUEST_INTERVAL_SECONDS: 313 wait_time = GEMINI_REQUEST_INTERVAL_SECONDS - time_since_last 314 print(f"Gemini Rate Limiter: Waiting {wait_time:.2f} seconds.") 315 time.sleep(wait_time) 316 LAST_GEMINI_REQUEST_TIME = time.time() 317 318 319 def get_ai_response_with_retry(prompt: str, max_retries: int = 3) -> List[Dict[str, Any]]: 320 model_name = os.environ.get('GEMINI_MODEL', 'gemini-1.5-flash-latest') 321 322 if not gemini_client_module: 323 print("Error: Gemini client module not initialized. Cannot make API call.") 324 return [] 325 326 try: 327 gemini_model = gemini_client_module.GenerativeModel(model_name) 328 except Exception as e: 329 print(f"Error creating GenerativeModel instance with {model_name}: {e}") 330 return [] 331 332 generation_config = { 333 "max_output_tokens": 8192, 334 "temperature": 0.5, # Increased slightly from 0.4 335 "top_p": 0.95, 336 } 337 338 safety_settings = [ 339 {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"}, 340 {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"}, 341 {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"}, 342 {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"}, 343 ] 344 345 # Log the prompt before sending 346 # For very long prompts, log only a summary or start/end 347 prompt_log_max_len = 2000 348 if len(prompt) > prompt_log_max_len: 349 print(f"Full prompt (length {len(prompt)}). Start:\n{prompt[:prompt_log_max_len//2]}...\n...End:\n{prompt[-(prompt_log_max_len//2):]}") 350 else: 351 print(f"Full prompt:\n{prompt}") 352 353 354 for attempt in range(1, max_retries + 1): 355 try: 356 enforce_gemini_rate_limits() 357 print(f"Attempt {attempt}/{max_retries} - Sending prompt to Gemini model {model_name}...") 358 359 response = gemini_model.generate_content( 360 prompt, 361 generation_config=generation_config, 362 safety_settings=safety_settings 363 ) 364 365 if not response.parts: 366 print(f"Warning: AI response (attempt {attempt}) was empty or blocked. Prompt safety ratings: {response.prompt_feedback if hasattr(response, 'prompt_feedback') else 'N/A'}") 367 if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason: 368 print(f"Prompt blocked due to: {response.prompt_feedback.block_reason_message}") 369 if attempt < max_retries: 370 time.sleep( (2 ** attempt) * 2 ) 371 continue 372 return [] 373 374 375 response_text = response.text.strip() 376 if response_text.startswith("```json"): 377 response_text = response_text[len("```json"):] 378 if response_text.endswith("```"): 379 response_text = response_text[:-len("```")] 380 response_text = response_text.strip() 381 382 # Log the raw response text before parsing, for debugging "no suggestions" 383 print(f"AI Response Text (attempt {attempt}, cleaned for JSON parsing):\n{response_text}") 384 385 data = json.loads(response_text) 386 387 if not isinstance(data, dict) or "reviews" not in data or not isinstance(data["reviews"], list): 388 print(f"Error: AI response has invalid structure. Expected {{'reviews': [...]}}. Got: {type(data)}") 389 if attempt < max_retries: time.sleep( (2 ** attempt) ); continue 390 else: return [] 391 392 valid_reviews = [] 393 for i, review_item in enumerate(data["reviews"]): 394 if not isinstance(review_item, dict): 395 print(f"Error: Review item {i} is not a dict: {review_item}") 396 continue 397 required_keys = ["hunkIndex", "lineNumber", "reviewComment", "confidence"] 398 if not all(k in review_item for k in required_keys): 399 print(f"Error: Review item {i} missing one or more required keys ({', '.join(required_keys)}): {review_item}") 400 continue 401 try: 402 review_item["hunkIndex"] = int(review_item["hunkIndex"]) 403 review_item["lineNumber"] = int(review_item["lineNumber"]) 404 except ValueError: 405 print(f"Error: Review item {i} hunkIndex or lineNumber not an int: {review_item}") 406 continue 407 if review_item["confidence"] not in ["High", "Medium", "Low"]: 408 print(f"Warning: Review item {i} has invalid confidence '{review_item.get('confidence')}'. Defaulting to Low.") 409 review_item["confidence"] = "Low" 410 411 valid_reviews.append(review_item) 412 413 return valid_reviews 414 415 except json.JSONDecodeError as e: 416 print(f"Error decoding JSON from AI response (attempt {attempt}): {e}") 417 response_text_for_log = "N/A" 418 # 'response_text' is already defined from the try block 419 if 'response_text' in locals() : response_text_for_log = response_text 420 elif 'response' in locals() and hasattr(response, 'text'): response_text_for_log = response.text 421 422 423 print(f"Response text that failed parsing (first 500 chars): '{response_text_for_log[:500]}'") 424 if attempt == max_retries: return [] 425 time.sleep( (2 ** attempt) ) 426 except Exception as e: 427 print(f"Error during Gemini API call (attempt {attempt}): {type(e).__name__} - {e}") 428 if "rate limit" in str(e).lower() or "429" in str(e) or "ResourceExhausted" in type(e).__name__: 429 delay = (2 ** attempt) + random.uniform(0,1) 430 print(f"Rate limit likely hit. Retrying in {delay:.2f} seconds...") 431 time.sleep(delay) 432 elif attempt == max_retries: 433 print(f"Max retries ({max_retries}) reached. Giving up on this API call.") 434 return [] 435 else: 436 time.sleep(5 * attempt) 437 438 return [] 439 440 441 def analyze_code(files_to_review: Iterable[PatchedFile], pr_details: PRDetails) -> List[Dict[str, Any]]: 442 files_list = list(files_to_review) 443 print(f"Starting code analysis for {len(files_list)} files.") 444 all_comments_for_pr = [] 445 446 for patched_file in files_list: 447 if not patched_file.path or patched_file.path == "/dev/null": 448 print(f"Skipping file with invalid path: {patched_file.path}") 449 continue 450 451 hunks_in_file = list(patched_file) 452 if not hunks_in_file: 453 print(f"No hunks in file {patched_file.path}, skipping.") 454 continue 455 456 print(f"\nProcessing file: {patched_file.path} with {len(hunks_in_file)} hunks.") 457 458 batch_prompt = create_batch_prompt(patched_file, pr_details) 459 ai_reviews_for_file = get_ai_response_with_retry(batch_prompt) 460 461 if ai_reviews_for_file: 462 print(f"Received {len(ai_reviews_for_file)} review suggestions from AI for file {patched_file.path}.") 463 file_comments = process_batch_ai_reviews(patched_file, ai_reviews_for_file) 464 if file_comments: 465 all_comments_for_pr.extend(file_comments) 466 else: 467 print(f"No review suggestions from AI for file {patched_file.path}.") 468 469 print(f"\nFinished analysis. Total comments generated for PR: {len(all_comments_for_pr)}") 470 return all_comments_for_pr 471 472 473 def get_hunk_header_str(hunk: Hunk) -> str: 474 # A Hunk's string representation starts with its header: "@@ -old_start,old_len +new_start,new_len @@" 475 # Or constructs it if not directly available. 476 # For logging, it's useful. 477 return f"@@ -{hunk.source_start},{hunk.source_length} +{hunk.target_start},{hunk.target_length} @@" 478 479 480 def calculate_github_position(file_patch: PatchedFile, target_hunk_obj: Hunk, relative_line_number_in_hunk_content: int) -> Optional[int]: 481 cumulative_pos_in_diff = 0 482 hunks_in_file = list(file_patch) 483 484 target_hunk_found = False 485 for current_hunk_obj in hunks_in_file: 486 cumulative_pos_in_diff += 1 487 488 if current_hunk_obj == target_hunk_obj: 489 target_hunk_found = True 490 comment_position = cumulative_pos_in_diff + relative_line_number_in_hunk_content -1 491 492 num_content_lines_in_target_hunk = len(list(target_hunk_obj)) 493 if not (1 <= relative_line_number_in_hunk_content <= num_content_lines_in_target_hunk): 494 target_hunk_header_str = get_hunk_header_str(target_hunk_obj) # Use helper 495 print(f"Warning: AI suggested line {relative_line_number_in_hunk_content} which is outside the actual " 496 f"content lines ({num_content_lines_in_target_hunk}) of the target hunk in {file_patch.path}. " 497 f"Target Hunk Header: {target_hunk_header_str.strip()}. Skipping this comment.") 498 return None 499 return comment_position 500 501 cumulative_pos_in_diff += len(list(current_hunk_obj)) 502 503 if not target_hunk_found: 504 target_hunk_header_str = get_hunk_header_str(target_hunk_obj) # Use helper 505 print(f"Error: Target hunk (header: {target_hunk_header_str.strip()}) not found by object comparison in file {file_patch.path} " 506 f"during position calculation.") 507 return None 508 509 510 def process_batch_ai_reviews(patched_file: PatchedFile, ai_reviews: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 511 comments_for_github = [] 512 hunks_in_file = list(patched_file) 513 514 for review_detail in ai_reviews: 515 try: 516 hunk_idx_from_ai = review_detail["hunkIndex"] 517 line_num_in_hunk_content = review_detail["lineNumber"] 518 comment_text = review_detail["reviewComment"] 519 confidence = review_detail["confidence"] 520 521 if not (0 <= hunk_idx_from_ai < len(hunks_in_file)): 522 print(f"Warning: AI returned out-of-bounds hunkIndex {hunk_idx_from_ai} for file {patched_file.path} " 523 f"(has {len(hunks_in_file)} hunks). Skipping comment.") 524 continue 525 526 target_hunk_object = hunks_in_file[hunk_idx_from_ai] 527 528 github_pos = calculate_github_position(patched_file, target_hunk_object, line_num_in_hunk_content) 529 530 if github_pos is not None: 531 formatted_comment_body = f"**AI Confidence: {confidence}**\n\n{comment_text}" 532 533 gh_comment = { 534 "body": formatted_comment_body, 535 "path": patched_file.path, 536 "position": github_pos, 537 "confidence_raw": confidence 538 } 539 comments_for_github.append(gh_comment) 540 else: 541 print(f"Warning: Could not calculate GitHub position for comment in {patched_file.path}, " 542 f"Hunk Index {hunk_idx_from_ai}, Line {line_num_in_hunk_content}. Skipping.") 543 544 except KeyError as e: 545 print(f"Error processing AI review item due to missing key {e}: {review_detail}") 546 except Exception as e: 547 print(f"Unexpected error processing AI review item {review_detail}: {e}") 548 traceback.print_exc() 549 550 return comments_for_github 551 552 553 def save_review_results_to_json(pr_details: PRDetails, comments: List[Dict[str, Any]], filepath_str: str = "reviews/gemini-pr-review.json") -> str: 554 filepath = Path(filepath_str) 555 filepath.parent.mkdir(parents=True, exist_ok=True) 556 557 review_data = { 558 "metadata": { 559 "pr_number": pr_details.pull_number, 560 "repo": pr_details.get_full_repo_name(), 561 "title": pr_details.title, 562 "timestamp_utc": datetime.datetime.utcnow().isoformat() + "Z", 563 "review_tool": "Gemini AI Reviewer", 564 "model_used": os.environ.get('GEMINI_MODEL', 'N/A') 565 }, 566 "review_comments": [] 567 } 568 569 for gh_comment_dict in comments: 570 structured_comment = { 571 "file_path": gh_comment_dict["path"], 572 "github_diff_position": gh_comment_dict["position"], 573 "comment_text_md": gh_comment_dict["body"], 574 "ai_confidence": gh_comment_dict.get("confidence_raw", "N/A"), 575 "detected_severity_heuristic": detect_severity(gh_comment_dict["body"]), 576 "detected_category_heuristic": detect_category(gh_comment_dict["body"]) 577 } 578 review_data["review_comments"].append(structured_comment) 579 580 with open(filepath, "w", encoding="utf-8") as f: 581 json.dump(review_data, f, indent=2) 582 583 print(f"Review results saved to {filepath}") 584 return str(filepath) 585 586 587 def detect_severity(comment_text: str) -> str: 588 lower_text = comment_text.lower() 589 if any(word in lower_text for word in ["critical", "security vulnerability", "crash", "exploit", "must fix", "data loss"]): 590 return "critical" 591 if any(word in lower_text for word in ["bug", "error", "incorrect", "wrong", "security", "potential vulnerability", "flaw"]): 592 return "high" 593 if any(word in lower_text for word in ["performance", "optimization", "memory", "leak", "consider fixing", "confusing", "unclear"]): 594 return "medium" 595 return "low" 596 597 def detect_category(comment_text: str) -> str: 598 lower_text = comment_text.lower() 599 if any(word in lower_text for word in ["security", "vulnerability", "exploit", "auth", "csrf", "xss", "injection", "password", "secret"]): 600 return "security" 601 if any(word in lower_text for word in ["performance", "slow", "optimization", "efficient", "memory", "cpu", "latency", "resource"]): 602 return "performance" 603 if any(word in lower_text for word in ["bug", "error", "incorrect", "wrong", "fix", "defect", "exception", "nullpointer"]): 604 return "bug" 605 if any(word in lower_text for word in ["style", "format", "naming", "convention", "readability", "clarity", "understandability", "documentation", "commenting"]): 606 return "style/clarity" 607 if any(word in lower_text for word in ["refactor", "clean", "simplify", "maintainability", "design", "architecture", "pattern", "anti-pattern", "duplication"]): 608 return "refactoring/design" 609 if any(word in lower_text for word in ["test", "coverage", "assertion", "mocking"]): 610 return "testing" 611 return "general" 612 613 614 def create_review_and_summary_comment(pr_details: PRDetails, comments_for_gh_review: List[Dict[str, Any]], review_json_path: str): 615 if not pr_details.pr_obj: 616 print("Error: PR object not available in PRDetails. Cannot create review or comments.") 617 return 618 619 pr = pr_details.pr_obj 620 num_suggestions = len(comments_for_gh_review) 621 622 if num_suggestions > 0: 623 valid_review_comments = [] 624 for c in comments_for_gh_review: 625 if all(k in c for k in ["body", "path", "position"]): 626 if isinstance(c["position"], int) and isinstance(c["path"], str) and isinstance(c["body"], str): 627 valid_review_comments.append({ 628 "body": c["body"], 629 "path": c["path"], 630 "position": c["position"] 631 }) 632 else: 633 print(f"Warning: Skipping malformed comment due to type mismatch: {c}") 634 else: 635 print(f"Warning: Skipping malformed comment due to missing keys: {c}") 636 637 if valid_review_comments: 638 try: 639 print(f"Creating a PR review with {len(valid_review_comments)} suggestions.") 640 pr.create_review( 641 body="Automated AI code review suggestions:", 642 event="COMMENT", 643 comments=valid_review_comments 644 ) 645 print("Successfully created PR review with suggestions.") 646 except GithubException as e: 647 print(f"Error creating PR review: {e}. Status: {e.status}, Data: {e.data}") 648 print("Falling back to posting individual issue comments for suggestions.") 649 for c_item in valid_review_comments: 650 try: 651 pr.create_issue_comment(f"**File:** `{c_item['path']}` (at diff position {c_item['position']})\n\n{c_item['body']}") 652 except Exception as ie: 653 print(f"Error posting individual suggestion as issue comment: {ie}") 654 except Exception as e: 655 print(f"Unexpected error during PR review creation: {e}") 656 traceback.print_exc() 657 else: 658 print("No validly structured comments to create a review with.") 659 else: 660 print("No suggestions to create a PR review for.") 661 662 repo_full_name = os.environ.get("GITHUB_REPOSITORY", pr_details.get_full_repo_name()) 663 server_url = os.environ.get("GITHUB_SERVER_URL", "https://github.com") 664 branch_name = os.environ.get("GITHUB_HEAD_REF") 665 if not branch_name and hasattr(pr.head, 'ref'): 666 branch_name = pr.head.ref 667 668 review_file_url_md = f"Review JSON file (`{review_json_path}` in the repository)" 669 if branch_name: 670 try: 671 encoded_branch = urllib.parse.quote_plus(branch_name) 672 review_file_url = f"{server_url}/{repo_full_name}/blob/{encoded_branch}/{review_json_path}" 673 review_file_url_md = f"Full review details in [`{review_json_path}`]({review_file_url})" 674 print(f"Summary comment will link to: {review_file_url}") 675 except Exception as url_e: 676 print(f"Error creating review file URL: {url_e}") 677 else: 678 print("Warning: Could not determine branch name for summary comment URL.") 679 680 681 summary_body = f"✨ **Gemini AI Code Review Complete** ✨\n\n" 682 if num_suggestions > 0: 683 summary_body += f"- Found {num_suggestions} potential areas for discussion/improvement (see review comments above or in the review tab).\n" 684 else: 685 summary_body += "- No specific suggestions made by the AI in this pass.\n" 686 summary_body += f"- {review_file_url_md}.\n" 687 summary_body += f"- Model: `{os.environ.get('GEMINI_MODEL', 'N/A')}`\n" 688 summary_body += f"- Reviewed at: {datetime.datetime.utcnow().isoformat()}Z\n" 689 690 try: 691 pr.create_issue_comment(summary_body) 692 print("Successfully created summary comment on PR.") 693 except GithubException as e: 694 print(f"Error creating summary PR comment: {e}") 695 except Exception as e: 696 print(f"Unexpected error creating summary PR comment: {e}") 697 traceback.print_exc() 698 699 700 def parse_diff_to_patchset(diff_text: str) -> Optional[PatchSet]: 701 if not diff_text: 702 print("No diff text to parse.") 703 return None 704 try: 705 patch_set = PatchSet(diff_text) 706 print(f"Diff parsed into PatchSet with {len(list(patch_set))} patched files.") 707 return patch_set 708 except Exception as e: 709 print(f"Error parsing diff string with unidiff: {type(e).__name__} - {e}") 710 print(f"Diff text that failed (first 1000 chars): {diff_text[:1000]}") 711 return None 712 713 714 def main(): 715 print("Starting AI Code Review Script...") 716 if not gh or not gemini_client_module: 717 print("Error: GitHub or Gemini client not available. Exiting.") 718 sys.exit(1) 719 720 pr_details = get_pr_details() 721 print(f"Processing PR #{pr_details.pull_number} in repo {pr_details.get_full_repo_name()} (Event: {pr_details.event_type})") 722 723 last_run_sha_from_env = os.environ.get("LAST_RUN_SHA", "").strip() 724 head_sha = pr_details.pr_obj.head.sha 725 base_sha = pr_details.pr_obj.base.sha 726 727 comparison_sha_for_diff = None 728 if pr_details.event_type in ["opened", "reopened"]: 729 comparison_sha_for_diff = base_sha 730 print(f"Event type is '{pr_details.event_type}'. Reviewing full PR against base SHA: {comparison_sha_for_diff}") 731 elif pr_details.event_type == "synchronize": 732 if last_run_sha_from_env and last_run_sha_from_env != head_sha : 733 comparison_sha_for_diff = last_run_sha_from_env 734 print(f"Event type is 'synchronize'. Reviewing changes since last run SHA: {comparison_sha_for_diff}") 735 else: 736 comparison_sha_for_diff = base_sha 737 if not last_run_sha_from_env: 738 print(f"Event type is 'synchronize', but no last_run_sha found. Reviewing full PR against base SHA: {comparison_sha_for_diff}") 739 elif last_run_sha_from_env == head_sha: 740 print(f"Event type is 'synchronize', but last_run_sha ({last_run_sha_from_env}) is same as head_sha. No new commits for incremental review. Defaulting to full review against base SHA: {comparison_sha_for_diff}.") 741 else: 742 comparison_sha_for_diff = base_sha 743 print(f"Event type is '{pr_details.event_type}'. Defaulting to full review against base SHA: {comparison_sha_for_diff}") 744 745 if head_sha == comparison_sha_for_diff: 746 print(f"HEAD SHA ({head_sha}) is the same as comparison SHA ({comparison_sha_for_diff}). No new changes to diff.") 747 save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json") 748 create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json") 749 print("Exiting as there are no new changes to review based on SHAs.") 750 return 751 752 diff_text = get_diff(pr_details, comparison_sha_for_diff) 753 if not diff_text: 754 print("No diff content retrieved. Exiting review process.") 755 save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json") 756 create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json") 757 return 758 759 initial_patch_set = parse_diff_to_patchset(diff_text) 760 if not initial_patch_set: 761 print("Failed to parse diff into PatchSet. Exiting.") 762 save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json") 763 sys.exit(1) 764 765 exclude_patterns_str = os.environ.get("INPUT_EXCLUDE", "") 766 exclude_patterns = [p.strip() for p in exclude_patterns_str.split(',') if p.strip()] 767 768 actual_files_to_process: List[PatchedFile] = [] 769 for patched_file_obj in initial_patch_set: 770 normalized_path = patched_file_obj.path.lstrip('./') 771 is_excluded = False 772 773 if patched_file_obj.is_removed_file or (patched_file_obj.is_added_file and patched_file_obj.target_file == '/dev/null'): 774 print(f"Skipping removed file (or added as /dev/null): {patched_file_obj.path}") 775 is_excluded = True 776 elif patched_file_obj.is_binary_file: 777 print(f"Excluding binary file: {patched_file_obj.path}") 778 is_excluded = True 779 else: 780 for pattern in exclude_patterns: 781 if fnmatch.fnmatch(normalized_path, pattern) or fnmatch.fnmatch(patched_file_obj.path, pattern): 782 print(f"Excluding file '{patched_file_obj.path}' due to pattern '{pattern}'.") 783 is_excluded = True 784 break 785 if not is_excluded: 786 actual_files_to_process.append(patched_file_obj) 787 788 num_files_to_analyze = len(actual_files_to_process) 789 print(f"Number of files to analyze after exclusions: {num_files_to_analyze}") 790 791 if num_files_to_analyze == 0: 792 print("No files to analyze after applying exclusion patterns.") 793 save_review_results_to_json(pr_details, [], "reviews/gemini-pr-review.json") 794 create_review_and_summary_comment(pr_details, [], "reviews/gemini-pr-review.json") 795 return 796 797 comments_for_gh_review_api = analyze_code(actual_files_to_process, pr_details) 798 799 review_json_filepath = "reviews/gemini-pr-review.json" 800 save_review_results_to_json(pr_details, comments_for_gh_review_api, review_json_filepath) 801 create_review_and_summary_comment(pr_details, comments_for_gh_review_api, review_json_filepath) 802 803 print("AI Code Review Script finished.") 804 805 806 if __name__ == "__main__": 807 try: 808 main() 809 except SystemExit: 810 raise 811 except Exception as e: 812 print(f"Unhandled exception in __main__: {type(e).__name__} - {e}") 813 traceback.print_exc() 814 sys.exit(1)