#!/usr/bin/env python3
# build-for-compare.py
# Written by W.J. van der Laan, provided under MIT license.
#
# Usage: ../do_build.py <hash> [<hash> ...]
# Will produce a ../bitcoind.$1.stripped for binary comparison
import argparse
import hashlib
import logging
import os
import re
import shlex
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict
from typing import Iterator, List

logger = logging.getLogger('do_build')
# Use this command to compare resulting directories
# git diff -W --word-diff /tmp/compare/4b5b263 /tmp/compare/d1bc5bf

# WARNING WARNING WARNING
# DO NOT RUN this with --nocopy=1 on working tree if you have any local additions.
# It will nuke all non-repository files, multiple times over.
# WARNING WARNING WARNING

CONFIGURE_EXTRA=[
    'EVENT_CFLAGS=-I/opt/libevent/include',
    'EVENT_LIBS=-L/opt/libevent/lib -levent',
    'EVENT_PTHREADS_CFLAGS=-I/opt/libevent/include',
    'EVENT_PTHREADS_LIBS=-L/opt/libevent/lib -levent_pthreads'
]
DEFAULT_PARALLELISM=4
DEFAULT_ASSERTIONS=0
DEFAULT_NOCOPY=0
DEFAULT_PATCH='stripbuildinfo.patch'
TMPDIR=tempfile.gettempdir()
DEFAULT_TGTDIR=os.path.join(TMPDIR, 'compare')
DEFAULT_REPODIR=os.path.join(TMPDIR, 'repo')

# No debugging information (not used by analysis at the moment, saves on I/O)
OPTFLAGS=["-O0","-g0"]
# Some options from -O to reduce code size
# can't use -O or -Os as it does some weird cross-contamination between unchanged functions in compilation unit
# Selectively enable opts that don't interfere or cause excessive sensitivity to changes
#
OPTFLAGS+=["-fcombine-stack-adjustments","-fcompare-elim","-fcprop-registers","-fdefer-pop","-fforward-propagate","-fif-conversion","-fif-conversion2",
    "-finline-functions-called-once","-fshrink-wrap","-fsplit-wide-types","-ftree-bit-ccp","-ftree-ccp","-ftree-ch","-ftree-copy-prop","-ftree-copyrename",
    "-ftree-dce","-ftree-dominator-opts","-ftree-dse","-ftree-fre","-ftree-sink","-ftree-slsr","-ftree-sra","-ftree-ter"
]
#
# -ffunction-sections/-fdata-sections put every element in its own section. This is essential.
OPTFLAGS+=['-ffunction-sections', '-fdata-sections']
# Fix the random seed
OPTFLAGS+=['-frandom-seed=notsorandom']
# OFF: -fmerge-constants don't attempt to merge constants: this causes global interaction between sections/functions
# this was reenabled because it doesn't matter, the numbered section names are annoying merged or unmerged
OPTFLAGS+=['-fmerge-all-constants']
# -fipa-sra semi-randomly renames functions (or creates variants of functions with different names)
OPTFLAGS+=['-fno-ipa-sra']
# -freorder-functions moves functions to .unlikely .hot sections
OPTFLAGS+=['-fno-reorder-functions']
# no interprocedural optimizations
# -fno-ipa-profile -fno-ipa-pure-const -fno-ipa-reference -fno-guess-branch-probability -fno-ipa-cp

CPPFLAGS=[]
# Prevent __LINE__ from messing with things
#CPPFLAGS+=["-D__LINE__=0","-D__DATE__=\"\""] #-D__COUNTER__=0"
# XXX unfortunately this approach does not work thanks to boost.

# objcopy: strip all symbols, debug info, and the hash header section
OBJCOPY_ARGS=['-R.note.gnu.build-id','-g','-S']
OBJDUMP_ARGS=['-C','--no-show-raw-insn','-d','-r']

# Set QT_RCC_SOURCE_DATE_OVERRIDE so that bitcoin-qt is deterministic
os.environ['QT_RCC_SOURCE_DATE_OVERRIDE'] = '1'

# These can be overridden from the environment
GIT=os.getenv('GIT', 'git')
MAKE=os.getenv('MAKE', 'make')
RSYNC=os.getenv('RSYNC', 'rsync')
OBJCOPY=os.getenv('OBJCOPY', 'objcopy')
OBJDUMP=os.getenv('OBJDUMP', 'objdump')
OBJEXT=os.getenv('OBJEXT', '.o') # object file extension

PYDIR=os.path.dirname(os.path.abspath(__file__))
PATCHDIR=os.path.join(PYDIR,'patches')

# Matches the header line objdump prints before each disassembled section
_SECTION_HEADER_RE = re.compile(r'^Disassembly of section (.*):$')
# A commit id must consist of hexadecimal digits only (no sign, prefix or whitespace)
_HEX_COMMIT_RE = re.compile(r'[0-9a-fA-F]+')


def init_logging():
    '''
    Configure the root logger to write colourised records to stdout.

    Every standard log level gets its own ANSI-coloured format string; records
    with any other level fall back to the handler's default formatting.
    '''
    LOG_PREFMT = {
        logging.DEBUG: '\x1b[38;5;239m[%(name)-8s]\x1b[0m %(message)s\x1b[0m',
        logging.INFO: '\x1b[38;5;19m>\x1b[38;5;18m>\x1b[38;5;17m> \x1b[38;5;239m[%(name)-8s]\x1b[0m %(message)s\x1b[0m',
        logging.WARNING: '\x1b[38;5;228m>\x1b[38;5;227m>\x1b[38;5;226m> \x1b[38;5;239m[%(name)-8s]\x1b[38;5;226m %(message)s\x1b[0m',
        logging.ERROR: '\x1b[38;5;208m>\x1b[38;5;202m>\x1b[38;5;196m> \x1b[38;5;239m[%(name)-8s]\x1b[38;5;196m %(message)s\x1b[0m',
        logging.CRITICAL: '\x1b[48;5;196;38;5;16m>>> [%(name)-8s] %(message)s\x1b[0m',
    }

    class MyStreamHandler(logging.StreamHandler):
        '''Stream handler that picks a formatter based on the record's level.'''
        def __init__(self, stream, formatters):
            logging.StreamHandler.__init__(self, stream)
            self.formatters = formatters

        def format(self, record):
            formatter = self.formatters.get(record.levelno)
            if formatter is None:
                # Non-standard level: use the handler's default formatting
                return logging.StreamHandler.format(self, record)
            return formatter.format(record)

    formatters = {level: logging.Formatter(fmtstr) for (level, fmtstr) in LOG_PREFMT.items()}
    handler = MyStreamHandler(sys.stdout, formatters)
    logging.basicConfig(level=logging.DEBUG, handlers=[handler])


def safe_path(path: str) -> bool:
    '''
    Ensure dir is a path we can nuke without consequences.
    This is currently restricted to paths strictly inside TMPDIR
    (TMPDIR itself and look-alike siblings such as "<TMPDIR>foo" are rejected).
    '''
    abspath = os.path.abspath(path)
    if abspath[0] != '/':
        return False # not a POSIX-style absolute path
    comps = abspath[1:].split('/') # skip leading slash to avoid relying on empty first component
    if len(comps) <= 1:
        return False
    tmproot = os.path.abspath(TMPDIR)
    if abspath == tmproot:
        return False
    # Compare whole path components, not raw string prefixes
    return os.path.commonpath([abspath, tmproot]) == tmproot


def shell_split(s: str) -> List[str]:
    '''Split a shell-style command line string into a list of arguments.'''
    return shlex.split(s)


def shell_join(s) -> str:
    '''Join arguments into a single string, shell-quoting each one.'''
    return ' '.join(shlex.quote(x) for x in s)


def check_call(args) -> int:
    '''
    Wrapper for subprocess.check_call that logs what command failed.

    Returns the exit status (always 0); any exception is logged and re-raised.
    '''
    try:
        return subprocess.check_call(args)
    except Exception:
        logger.error('Command failed: {}'.format(shell_join(args)))
        raise


def cmd_exists(cmd) -> bool:
    '''Determine if a given command is available on the PATH.'''
    return shutil.which(cmd) is not None


def iterate_objs(srcdir) -> Iterator[str]:
    '''
    Iterate over all object files in srcdir.

    Yields paths relative to srcdir for every file whose name ends in OBJEXT.
    '''
    for (root, dirs, files) in os.walk(srcdir):
        # relpath is robust against a trailing slash on srcdir
        relroot = os.path.relpath(root, srcdir)
        if relroot == os.curdir:
            relroot = ''
        for filename in files:
            if filename.endswith(OBJEXT):
                yield os.path.join(relroot, filename)


def copy_o_files(srcdir: str, tgtdir: str):
    '''Copy all object files from srcdir to tgtdir, keeping the same directory hierarchy'''
    for objname in iterate_objs(srcdir):
        outname = os.path.join(tgtdir, objname)
        os.makedirs(os.path.dirname(outname), exist_ok=True)
        shutil.copy(os.path.join(srcdir, objname), outname)


def objdump_all(srcdir: str, tgtdir: str):
    '''
    Object analysis pass using objdump.

    Disassembles every object file below srcdir and writes each disassembled
    section to tgtdir/<sha1 of section name>.dis. Sections with the same name
    coming from different object files end up in the same output file, the
    later one overwriting the earlier one.

    Raises RuntimeError when objdump exits with a non-zero status.
    '''
    for relname in iterate_objs(srcdir):
        objname = os.path.join(srcdir, relname)
        p = subprocess.Popen([OBJDUMP] + OBJDUMP_ARGS + [objname], stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (out,err) = p.communicate()
        (out,err) = (out.decode(errors='replace'),err.decode(errors='replace'))
        if p.returncode != 0:
            raise RuntimeError('objdump failed on {}: {}'.format(objname, err.strip()))

        # postprocess- break into sections separated by 'Disassembly of section...'
        sections = defaultdict(list)
        section_name = '' # '' collects the header lines printed before the first section
        for line in out.splitlines():
            match = _SECTION_HEADER_RE.match(line)
            if match:
                section_name = match.group(1)
            if '.rodata' not in line: # filter out 'ebc: R_X86_64_32 .rodata+0x1944'
                sections[section_name].append(line)

        for (section, lines) in sections.items():
            if not section:
                continue
            name = hashlib.sha1(section.encode()).hexdigest()
            outname = os.path.join(tgtdir, name + '.dis')
            os.makedirs(os.path.dirname(outname), exist_ok=True)
            with open(outname, 'w') as f:
                f.write('\n'.join(lines))

# some TODO s, learning about the objdump output:
# - demangle section names
# - remove/make relative addresses
# - sort/combine sections
# - remove duplicate sections? (sounds like linker's work - can we do a partial link that preserves sections, such as for inlines?)
# - resolve callq's relocations - these are ugly right now - integrate reloc result into instruction by substituting argument
# - [- 17: R_X86_64_32S vtable for boost::exception_detail::bad_exception_+0x30-]
#   (at the very least delete callq's arguments)
# - for data (mov etc): fill in data? pointers change arbitrarily especially in combined string tables (.rodata.str1...)
#   and these entries don't have names/symbols
# - or could use a different disassembler completely, such as capstone. Parsing objdump output is a hack.


def parse_arguments():
    '''
    Parse and validate the command line.

    Returns the argparse namespace with "patches" turned into a
    {commit id: patch file name} dict, "executables" split into a list and
    "opt" turned into a list of compiler flags. Exits with status 1 on an
    invalid --opt value or an unsafe --repodir.
    '''
    parser = argparse.ArgumentParser(description='Build to compare binaries. Execute this from a repository directory.')
    parser.add_argument('commitids', metavar='COMMITID', nargs='+')
    parser.add_argument('--executables', default='src/bitcoind', help='Comma-separated list of executables to build, default is "src/bitcoind"')
    parser.add_argument('--tgtdir', default=DEFAULT_TGTDIR, help='Target directory, default is "{}"'.format(DEFAULT_TGTDIR))
    parser.add_argument('--repodir', default=DEFAULT_REPODIR, help='Temp repository directory, default is "{}"'.format(DEFAULT_REPODIR))
    parser.add_argument('--parallelism', '-j', default=DEFAULT_PARALLELISM, type=int, help='Make parallelism, default is {}'.format(DEFAULT_PARALLELISM))
    parser.add_argument('--assertions', default=DEFAULT_ASSERTIONS, type=int, help='Build with assertions, default is {}'.format(DEFAULT_ASSERTIONS))
    parser.add_argument('--opt', default=None, type=str, help='Override C/C++ optimization flags. Prepend + to avoid collisions with arguments, e.g. "+-O2 -g"')
    parser.add_argument('--patches', '-P', default=None, type=str, help='Comma separated list of stripbuildinfo patches to apply, one per hash (in order).')
    parser.add_argument('--prefix', default=None, type=str, help='A depends prefix that will be passed to configure')
    parser.add_argument('--nocopy', default=DEFAULT_NOCOPY, type=int, help='Build directly in the repository. If unset, will rsync or copy the repository to a temporary directory first, default is {}'.format(DEFAULT_NOCOPY))
    args = parser.parse_args()
    # Patches are matched to commit ids by position
    args.patches = dict(zip(args.commitids, [v.strip() for v in args.patches.split(',')])) if args.patches is not None else {}
    args.executables = args.executables.split(',')
    if args.opt is not None:
        if not args.opt.startswith('+'):
            print('"opt" argument must start with +', file=sys.stderr)
            sys.exit(1)
        args.opt = shell_split(args.opt[1:])
    else:
        args.opt = OPTFLAGS
    # Safety checks
    if not args.nocopy and not safe_path(args.repodir):
        suggestion = os.path.join(TMPDIR, os.path.basename(os.path.normpath(args.repodir)) or 'repo')
        logger.error('Temp repository directory {} may not be used. Please use {}, e.g. "{}"'.format(args.repodir, TMPDIR, suggestion))
        sys.exit(1)

    return args


def _prepare_target_dir(tgtdir):
    '''Create tgtdir, or offer to delete an existing one when it is safe to remove.'''
    try:
        os.makedirs(tgtdir)
    except FileExistsError:
        logger.warning("{} already exists, remove it if you don't want to continue a current comparison session".format(tgtdir))
        if safe_path(tgtdir):
            dodelete = input("Delete {}? [y/n] ".format(tgtdir))
            if dodelete == 'y' or dodelete == 'Y':
                # Remove target dir (it is re-created when the first commit directory is made)
                logger.info('Removing {}'.format(tgtdir))
                check_call(['rm', '-rf', tgtdir])


def _copy_repository(repodir):
    '''Mirror .git from the current directory into repodir using rsync, or cp as a fallback.'''
    rsync_cmd = shell_split(RSYNC)
    if rsync_cmd and cmd_exists(rsync_cmd[0]):
        logger.info('RSyncing repository ...')
        check_call(rsync_cmd + [
            '-r',       # recursive
            '--delete', # delete extraneous files on dst
            '.git',     # from .git in CWD
            repodir])   # to repodir
    else:
        gitdir = os.path.join(repodir, '.git')
        logger.warning('Command "rsync" not found; resorting to cp, which tends to be slower.')
        logger.info('Copying repository ...')
        # Touch (to avoid file not found) and remove repodir/.git so we don't end up with repodir/.git/.git
        check_call(['mkdir','-p',repodir])
        check_call(['touch',gitdir])
        check_call(['rm','-rf',gitdir])
        check_call(['cp','-r','.git',repodir])


def _build_commit(args, commit, make_args, cppflags):
    '''Check out, patch, configure and build one commit, then collect and analyse its objects.'''
    logger.info("Building {}...".format(commit))
    stripbuildinfopatch = args.patches.get(commit, DEFAULT_PATCH)
    commitdir = os.path.join(args.tgtdir, commit)
    commitdir_obj = os.path.join(args.tgtdir, commit+'.o')

    try:
        os.makedirs(commitdir)
    except FileExistsError:
        logger.error("{} already exists; skipping".format(commitdir))
        return
    check_call([GIT,'reset','--hard'])
    check_call([GIT,'clean','-f','-x','-d'])
    check_call([GIT,'checkout',commit])
    try:
        if commit in args.patches:
            logger.info('User-defined patch: {}'.format(stripbuildinfopatch))
        check_call([GIT,'apply', os.path.join(PATCHDIR,stripbuildinfopatch)])
    except subprocess.CalledProcessError:
        logger.error('Could not apply patch to strip build info. Probably it needs to be updated')
        sys.exit(1)

    check_call(['./autogen.sh'])
    logger.info('Running configure script')
    opt = shell_join(args.opt)
    check_call(['./configure', '--disable-hardening', '--without-cli', '--disable-tests', '--disable-bench', '--disable-ccache',
        '--prefix={}'.format(args.prefix) if args.prefix else '--with-incompatible-bdb',
        'CPPFLAGS='+(' '.join(cppflags)),
        'CFLAGS='+opt, 'CXXFLAGS='+opt, 'LDFLAGS='+opt] + CONFIGURE_EXTRA)

    for name in args.executables:
        logger.info('Building executable {}'.format(name))
        target_name = os.path.join(args.tgtdir, os.path.basename(name) + '.' + commit)
        check_call([MAKE] + make_args + [name])
        shutil.copy(name, target_name)
        check_call([OBJCOPY] + OBJCOPY_ARGS + [name, target_name + '.stripped'])

    logger.info('Copying object files...')
    copy_o_files('.', commitdir_obj)

    logger.info('Performing basic analysis pass...')
    objdump_all(commitdir_obj, commitdir)


def main():
    '''
    Build every requested commit and store the results below the target directory.

    Exits with status 1 when a commit id is invalid, a patch cannot be applied
    or any other error occurs during the build.
    '''
    # Logging must be ready before parse_arguments, which reports errors via the logger
    init_logging()
    args = parse_arguments()
    # The working directory changes below; pin the target directory first
    args.tgtdir = os.path.abspath(args.tgtdir)
    try:
        _prepare_target_dir(args.tgtdir)

        for commit in args.commitids:
            if not _HEX_COMMIT_RE.fullmatch(commit):
                logger.error('{} is not a hexadecimal commit id. It\'s the only thing we know.'.format(commit))
                sys.exit(1)

        # Copy repo, unless nocopy is set
        if not args.nocopy and safe_path(args.repodir):
            _copy_repository(args.repodir)
            # Go to repo
            os.chdir(args.repodir)

        # Determine (g)make arguments
        make_args = []
        if args.parallelism is not None:
            make_args += ['-j{}'.format(args.parallelism)]
        # Disable assertions if requested (work on a copy, leave CPPFLAGS untouched)
        cppflags = list(CPPFLAGS)
        if not args.assertions:
            cppflags += ['-DNDEBUG']

        for commit in args.commitids:
            _build_commit(args, commit, make_args, cppflags)

        if len(args.commitids)>1:
            logger.info('Use these commands to compare results:')
            logger.info('$ sha256sum {}/*.stripped'.format(args.tgtdir))
            logger.info('$ git diff -W --word-diff {} {}'.format(os.path.join(args.tgtdir,args.commitids[0]), os.path.join(args.tgtdir,args.commitids[1])))
    except Exception:
        logger.exception('Error:')
        sys.exit(1)


if __name__ == '__main__':
    main()