disassembler_objdump.cc
// Copyright (c) 2022, Google LLC
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// disassembler_objdump.cc: Disassembler that invokes objdump for disassembly.
30 // 31 // Author: Mark Brand 32 33 #ifdef HAVE_CONFIG_H 34 #include <config.h> // Must come first 35 #endif 36 37 #include "processor/disassembler_objdump.h" 38 39 #include <unistd.h> 40 #include <sys/wait.h> 41 42 #include <array> 43 #include <fstream> 44 #include <iostream> 45 #include <iterator> 46 #include <regex> 47 #include <sstream> 48 #include <vector> 49 50 #include "common/linux/eintr_wrapper.h" 51 #include "common/linux/scoped_pipe.h" 52 #include "common/linux/scoped_tmpfile.h" 53 #include "processor/logging.h" 54 55 namespace google_breakpad { 56 namespace { 57 58 const size_t kMaxX86InstructionLength = 15; 59 60 bool IsInstructionPrefix(const string& token) { 61 if (token == "lock" || token == "rep" || token == "repz" || 62 token == "repnz") { 63 return true; 64 } 65 return false; 66 } 67 68 bool IsOperandSize(const string& token) { 69 if (token == "BYTE" || token == "WORD" || token == "DWORD" || 70 token == "QWORD" || token == "PTR") { 71 return true; 72 } 73 return false; 74 } 75 76 bool GetSegmentAddressX86(const DumpContext& context, string segment_name, 77 uint64_t& address) { 78 if (segment_name == "ds") { 79 address = context.GetContextX86()->ds; 80 } else if (segment_name == "es") { 81 address = context.GetContextX86()->es; 82 } else if (segment_name == "fs") { 83 address = context.GetContextX86()->fs; 84 } else if (segment_name == "gs") { 85 address = context.GetContextX86()->gs; 86 } else { 87 BPLOG(ERROR) << "Unsupported segment register: " << segment_name; 88 return false; 89 } 90 91 return true; 92 } 93 94 bool GetSegmentAddressAMD64(const DumpContext& context, string segment_name, 95 uint64_t& address) { 96 if (segment_name == "ds") { 97 address = 0; 98 } else if (segment_name == "es") { 99 address = 0; 100 } else { 101 BPLOG(ERROR) << "Unsupported segment register: " << segment_name; 102 return false; 103 } 104 105 return true; 106 } 107 108 bool GetSegmentAddress(const DumpContext& context, string segment_name, 109 uint64_t& address) 
{ 110 if (context.GetContextCPU() == MD_CONTEXT_X86) { 111 return GetSegmentAddressX86(context, segment_name, address); 112 } else if (context.GetContextCPU() == MD_CONTEXT_AMD64) { 113 return GetSegmentAddressAMD64(context, segment_name, address); 114 } else { 115 BPLOG(ERROR) << "Unsupported architecture for GetSegmentAddress\n"; 116 return false; 117 } 118 } 119 120 bool GetRegisterValueX86(const DumpContext& context, string register_name, 121 uint64_t& value) { 122 if (register_name == "eax") { 123 value = context.GetContextX86()->eax; 124 } else if (register_name == "ebx") { 125 value = context.GetContextX86()->ebx; 126 } else if (register_name == "ecx") { 127 value = context.GetContextX86()->ecx; 128 } else if (register_name == "edx") { 129 value = context.GetContextX86()->edx; 130 } else if (register_name == "edi") { 131 value = context.GetContextX86()->edi; 132 } else if (register_name == "esi") { 133 value = context.GetContextX86()->esi; 134 } else if (register_name == "ebp") { 135 value = context.GetContextX86()->ebp; 136 } else if (register_name == "esp") { 137 value = context.GetContextX86()->esp; 138 } else if (register_name == "eip") { 139 value = context.GetContextX86()->eip; 140 } else { 141 BPLOG(ERROR) << "Unsupported register: " << register_name; 142 return false; 143 } 144 145 return true; 146 } 147 148 bool GetRegisterValueAMD64(const DumpContext& context, string register_name, 149 uint64_t& value) { 150 if (register_name == "rax") { 151 value = context.GetContextAMD64()->rax; 152 } else if (register_name == "rbx") { 153 value = context.GetContextAMD64()->rbx; 154 } else if (register_name == "rcx") { 155 value = context.GetContextAMD64()->rcx; 156 } else if (register_name == "rdx") { 157 value = context.GetContextAMD64()->rdx; 158 } else if (register_name == "rdi") { 159 value = context.GetContextAMD64()->rdi; 160 } else if (register_name == "rsi") { 161 value = context.GetContextAMD64()->rsi; 162 } else if (register_name == "rbp") { 163 value 
= context.GetContextAMD64()->rbp; 164 } else if (register_name == "rsp") { 165 value = context.GetContextAMD64()->rsp; 166 } else if (register_name == "r8") { 167 value = context.GetContextAMD64()->r8; 168 } else if (register_name == "r9") { 169 value = context.GetContextAMD64()->r9; 170 } else if (register_name == "r10") { 171 value = context.GetContextAMD64()->r10; 172 } else if (register_name == "r11") { 173 value = context.GetContextAMD64()->r11; 174 } else if (register_name == "r12") { 175 value = context.GetContextAMD64()->r12; 176 } else if (register_name == "r13") { 177 value = context.GetContextAMD64()->r13; 178 } else if (register_name == "r14") { 179 value = context.GetContextAMD64()->r14; 180 } else if (register_name == "r15") { 181 value = context.GetContextAMD64()->r15; 182 } else if (register_name == "rip") { 183 value = context.GetContextAMD64()->rip; 184 } else { 185 BPLOG(ERROR) << "Unsupported register: " << register_name; 186 return false; 187 } 188 189 return true; 190 } 191 192 // Lookup the value of `register_name` in `context`, store it into `value` on 193 // success. 194 // Support for non-full-size registers not implemented, since we're only using 195 // this to evaluate address expressions. 
196 bool GetRegisterValue(const DumpContext& context, string register_name, 197 uint64_t& value) { 198 if (context.GetContextCPU() == MD_CONTEXT_X86) { 199 return GetRegisterValueX86(context, register_name, value); 200 } else if (context.GetContextCPU() == MD_CONTEXT_AMD64) { 201 return GetRegisterValueAMD64(context, register_name, value); 202 } else { 203 BPLOG(ERROR) << "Unsupported architecture for GetRegisterValue\n"; 204 return false; 205 } 206 } 207 } // namespace 208 209 // static 210 bool DisassemblerObjdump::DisassembleInstruction(uint32_t cpu, 211 const uint8_t* raw_bytes, 212 unsigned int raw_bytes_len, 213 string& instruction) { 214 // Always initialize outputs 215 instruction = ""; 216 217 if (!raw_bytes || raw_bytes_len == 0) { 218 // There's no need to perform any operation in this case, as there's 219 // clearly no instruction there. 220 return false; 221 } 222 223 string architecture; 224 if (cpu == MD_CONTEXT_X86) { 225 architecture = "i386"; 226 } else if (cpu == MD_CONTEXT_AMD64) { 227 architecture = "i386:x86-64"; 228 } else { 229 BPLOG(ERROR) << "Unsupported architecture."; 230 return false; 231 } 232 233 // Create a temporary file for the raw instruction bytes to pass to 234 // objdump, and write the bytes to the input file. 235 ScopedTmpFile raw_bytes_file; 236 if (!raw_bytes_file.InitData(raw_bytes, raw_bytes_len)) { 237 BPLOG(ERROR) << "Failed creating temporary file."; 238 return false; 239 } 240 241 // Create a pipe to use to read the disassembly back from objdump. 242 ScopedPipe disassembly_pipe; 243 if (!disassembly_pipe.Init()) { 244 BPLOG(ERROR) << "Failed creating pipe for output."; 245 return false; 246 } 247 248 pid_t child_pid = fork(); 249 if (child_pid < 0) { 250 BPLOG(ERROR) << "Fork failed."; 251 return false; 252 } 253 254 if (child_pid == 0) { 255 // In the child process, set up the input and output file descriptors. 
256 if (dup2(raw_bytes_file.GetFd(), STDIN_FILENO) < 0 || 257 disassembly_pipe.Dup2WriteFd(STDOUT_FILENO) < 0 || 258 disassembly_pipe.Dup2WriteFd(STDERR_FILENO) < 0) { 259 BPLOG(ERROR) << "Failed dup'ing file descriptors."; 260 exit(-1); 261 } 262 263 // We need to close the read end of the pipe in the child process so that 264 // when the parent closes it, the pipe is disconnected. 265 disassembly_pipe.CloseReadFd(); 266 267 // We use "/proc/self/fd/0" here to allow objdump to parse an unnamed file, 268 // since objdump does not have a mode to read from stdin. This cannot be 269 // used with a pipe, since objdump requires that the input is a standard 270 // file. 271 execlp("objdump", "objdump", "-D", "--no-show-raw-insn", "-b", "binary", 272 "-M", "intel", "-m", architecture.c_str(), "/proc/self/fd/0", 273 nullptr); 274 275 BPLOG(ERROR) << "Failed to exec objdump."; 276 exit(-1); 277 } else { 278 // In the parent process, parse the objdump output. 279 280 // Match the instruction line, from: 281 // 0: lock cmpxchg DWORD PTR [esi+0x10],eax 282 // extract the string "lock cmpxchg DWORD PTR [esi+0x10],eax" 283 std::regex instruction_regex( 284 "^\\s+[0-9a-f]+:\\s+" // " 0:" 285 "((?:\\s*\\S*)+)$"); // "lock cmpxchg..." 286 287 std::string line; 288 std::smatch match; 289 while (disassembly_pipe.ReadLine(line)) { 290 if (std::regex_match(line, match, instruction_regex)) { 291 instruction = match[1].str(); 292 break; 293 } 294 } 295 296 // Close the read pipe so that objdump will exit (in case we broke out of 297 // the loop above before reading all of the output). 298 disassembly_pipe.CloseReadFd(); 299 300 // Now wait for objdump to exit. 
301 int status = 0; 302 HANDLE_EINTR(waitpid(child_pid, &status, 0)); 303 304 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 305 BPLOG(ERROR) << "objdump didn't run successfully."; 306 return false; 307 } 308 309 if (instruction == "") { 310 BPLOG(ERROR) << "Failed to find instruction in objdump output."; 311 return false; 312 } 313 } 314 315 return true; 316 } 317 318 // static 319 bool DisassemblerObjdump::TokenizeInstruction(const string& instruction, 320 string& operation, string& dest, 321 string& src) { 322 // Always initialize outputs. 323 operation = ""; 324 dest = ""; 325 src = ""; 326 327 // Split the instruction into tokens by either whitespace or comma. 328 std::regex token_regex("((?:[^\\s,]+)|,)(?:\\s)*"); 329 std::sregex_iterator tokens_begin(instruction.begin(), instruction.end(), 330 token_regex); 331 332 bool found_comma = false; 333 for (auto tokens_iter = tokens_begin; tokens_iter != std::sregex_iterator(); 334 ++tokens_iter) { 335 auto token = (*tokens_iter)[1].str(); 336 if (operation.size() == 0) { 337 if (IsInstructionPrefix(token)) 338 continue; 339 operation = token; 340 } else if (dest.size() == 0) { 341 if (IsOperandSize(token)) 342 continue; 343 dest = token; 344 } else if (!found_comma) { 345 if (token == ",") { 346 found_comma = true; 347 } else { 348 BPLOG(ERROR) << "Failed to parse operands from objdump output, expected" 349 " comma but found \"" 350 << token << "\""; 351 return false; 352 } 353 } else if (src.size() == 0) { 354 if (IsOperandSize(token)) 355 continue; 356 src = token; 357 } else { 358 if (token == ",") { 359 BPLOG(ERROR) << "Failed to parse operands from objdump output, found " 360 "unexpected comma after last operand."; 361 return false; 362 } else { 363 // We just ignore other junk after the last operand unless it's a 364 // comma, which would indicate we're probably still in the middle 365 // of the operands and something has gone wrong 366 } 367 } 368 } 369 370 if (found_comma && src.size() == 0) { 371 
BPLOG(ERROR) << "Failed to parse operands from objdump output, found comma " 372 "but no src operand."; 373 return false; 374 } 375 376 return true; 377 } 378 379 // static 380 bool DisassemblerObjdump::CalculateAddress(const DumpContext& context, 381 const string& expression, 382 uint64_t& address) { 383 address = 0; 384 385 // Extract the components of the expression. 386 // fs:[esi+edi*4+0x80] -> ["fs", "esi", "edi", "4", "-", "0x80"] 387 std::regex expression_regex( 388 "^(?:(\\ws):)?" // "fs:" 389 "\\[(\\w+)" // "[esi" 390 "(?:\\+(\\w+)(?:\\*(\\d+)))?" // "+edi*4" 391 "(?:([\\+-])(0x[0-9a-f]+))?" // "-0x80" 392 "\\]$"); // "]" 393 394 std::smatch match; 395 if (!std::regex_match(expression, match, expression_regex) || 396 match.size() != 7) { 397 return false; 398 } 399 400 string segment_name = match[1].str(); 401 string register_name = match[2].str(); 402 string index_name = match[3].str(); 403 string index_stride = match[4].str(); 404 string offset_sign = match[5].str(); 405 string offset = match[6].str(); 406 407 uint64_t segment_address = 0; 408 uint64_t register_value = 0; 409 uint64_t index_value = 0; 410 uint64_t index_stride_value = 1; 411 uint64_t offset_value = 0; 412 413 if (segment_name.size() && 414 !GetSegmentAddress(context, segment_name, segment_address)) { 415 return false; 416 } 417 418 if (!GetRegisterValue(context, register_name, register_value)) { 419 return false; 420 } 421 422 if (index_name.size() && 423 !GetRegisterValue(context, index_name, index_value)) { 424 return false; 425 } 426 427 if (index_stride.size()) { 428 index_stride_value = strtoull(index_stride.c_str(), nullptr, 0); 429 } 430 431 if (offset.size()) { 432 offset_value = strtoull(offset.c_str(), nullptr, 0); 433 } 434 435 address = 436 segment_address + register_value + (index_value * index_stride_value); 437 if (offset_sign == "+") { 438 address += offset_value; 439 } else if (offset_sign == "-") { 440 address -= offset_value; 441 } 442 443 return true; 444 } 445 446 
DisassemblerObjdump::DisassemblerObjdump(const uint32_t cpu, 447 const MemoryRegion* memory_region, 448 uint64_t address) { 449 if (address < memory_region->GetBase() || 450 memory_region->GetBase() + memory_region->GetSize() <= address) { 451 return; 452 } 453 454 uint8_t ip_bytes[kMaxX86InstructionLength] = {0}; 455 size_t ip_bytes_length; 456 for (ip_bytes_length = 0; ip_bytes_length < kMaxX86InstructionLength; 457 ++ip_bytes_length) { 458 // We have to read byte-by-byte here, since we still want to try and 459 // disassemble an instruction even if we don't have enough bytes. 460 if (!memory_region->GetMemoryAtAddress(address + ip_bytes_length, 461 &ip_bytes[ip_bytes_length])) { 462 break; 463 } 464 } 465 466 string instruction; 467 if (!DisassembleInstruction(cpu, ip_bytes, kMaxX86InstructionLength, 468 instruction)) { 469 return; 470 } 471 472 if (!TokenizeInstruction(instruction, operation_, dest_, src_)) { 473 return; 474 } 475 } 476 477 bool DisassemblerObjdump::CalculateSrcAddress(const DumpContext& context, 478 uint64_t& address) { 479 return CalculateAddress(context, src_, address); 480 } 481 482 bool DisassemblerObjdump::CalculateDestAddress(const DumpContext& context, 483 uint64_t& address) { 484 return CalculateAddress(context, dest_, address); 485 } 486 487 } // namespace google_breakpad