/ build-for-compare.py
build-for-compare.py
  1  #!/usr/bin/env python3
  2  # Written by W.J. van der Laan, provided under MIT license.
  3  #
# Usage: build-for-compare.py <hash> [<hash> ...]
# Will produce <tgtdir>/bitcoind.<hash>.stripped (tgtdir defaults to <tmp>/compare) for binary comparison
import argparse
import hashlib
import logging
import os
import re
import shlex
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict
from typing import Iterator, List, Optional
  9  
 10  logger = logging.getLogger('do_build')
 11  # Use this command to compare resulting directories
 12  # git diff -W --word-diff /tmp/compare/4b5b263 /tmp/compare/d1bc5bf
 13  
 14  # WARNING WARNING WARNING
 15  #   DO NOT RUN this with --nocopy=1 on working tree if you have any local additions.
 16  #   It will nuke all non-repository files, multiple times over.
 17  # WARNING WARNING WARNING
 18  
 19  CONFIGURE_EXTRA=[
 20  'EVENT_CFLAGS=-I/opt/libevent/include',
 21  'EVENT_LIBS=-L/opt/libevent/lib -levent',
 22  'EVENT_PTHREADS_CFLAGS=-I/opt/libevent/include',
 23  'EVENT_PTHREADS_LIBS=-L/opt/libevent/lib -levent_pthreads'
 24  ]
 25  DEFAULT_PARALLELISM=4
 26  DEFAULT_ASSERTIONS=0
 27  DEFAULT_NOCOPY=0
 28  DEFAULT_PATCH='stripbuildinfo.patch'
 29  TMPDIR=tempfile.gettempdir()
 30  DEFAULT_TGTDIR=os.path.join(TMPDIR, 'compare')
 31  DEFAULT_REPODIR=os.path.join(TMPDIR, 'repo')
 32  
 33  # No debugging information (not used by analysis at the moment, saves on I/O)
 34  OPTFLAGS=["-O0","-g0"]
 35  # Some options from -O to reduce code size
 36  # can't use -O or -Os as it does some weird cross-contamination between unchanged functions in compilation unit
 37  # Selectively enable opts that don't interfere or cause excessive sensitivity to changes
 38  #
 39  OPTFLAGS+=["-fcombine-stack-adjustments","-fcompare-elim","-fcprop-registers","-fdefer-pop","-fforward-propagate","-fif-conversion","-fif-conversion2",
 40          "-finline-functions-called-once","-fshrink-wrap","-fsplit-wide-types","-ftree-bit-ccp","-ftree-ccp","-ftree-ch","-ftree-copy-prop","-ftree-copyrename",
 41          "-ftree-dce","-ftree-dominator-opts","-ftree-dse","-ftree-fre","-ftree-sink","-ftree-slsr","-ftree-sra","-ftree-ter"
 42  ]
 43  #
 44  # -ffunctions-sections/-fdata-sections put every element in its own section. This is essential.
 45  OPTFLAGS+=['-ffunction-sections', '-fdata-sections']
 46  # Fix the random seed
 47  OPTFLAGS+=['-frandom-seed=notsorandom']
 48  # OFF: -fmerge-constants don't attempt to merge constants: this causes global interaction between sections/functions
 49  # this was reenabled because it doesn't matter, the numbered section names are annoying merged or unmerged
 50  OPTFLAGS+=['-fmerge-all-constants']
 51  # -fipa-sra semi-randomly renames functions (or creates variants of functions with different names(
 52  OPTFLAGS+=['-fno-ipa-sra']
 53  # -freorder-functions moves functions to .unlikely .hot sections
 54  OPTFLAGS+=['-fno-reorder-functions']
 55  # no interprocedural optimizations
 56  # -fno-ipa-profile -fno-ipa-pure-const -fno-ipa-reference -fno-guess-branch-probability -fno-ipa-cp
 57  
 58  CPPFLAGS=[]
 59  # Prevent __LINE__ from messing with things
 60  #CPPFLAGS+=["-D__LINE__=0","-D__DATE__=\"\""] #-D__COUNTER__=0"
 61  # XXX unfortunately this approach does not work thanks to boost.
 62  
 63  # objcopy: strip all symbols, debug info, and the hash header section
 64  OBJCOPY_ARGS=['-R.note.gnu.build-id','-g','-S']
 65  OBJDUMP_ARGS=['-C','--no-show-raw-insn','-d','-r']
 66  
 67  # Set QT_RCC_SOURCE_DATE_OVERRIDE so that bitcoin-qt is deterministic
 68  os.environ['QT_RCC_SOURCE_DATE_OVERRIDE'] = '1'
 69  
 70  # These can be overridden from the environment
 71  GIT=os.getenv('GIT', 'git')
 72  MAKE=os.getenv('MAKE', 'make')
 73  RSYNC=os.getenv('RSYNC', 'rsync')
 74  OBJCOPY=os.getenv('OBJCOPY', 'objcopy')
 75  OBJDUMP=os.getenv('OBJDUMP', 'objdump')
 76  OBJEXT=os.getenv('OBJEXT', '.o') # object file extension
 77  
 78  PYDIR=os.path.dirname(os.path.abspath(__file__))
 79  PATCHDIR=os.path.join(PYDIR,'patches')
 80  
 81  def init_logging():
 82      LOG_PREFMT = {
 83          (logging.DEBUG, '\x1b[38;5;239m[%(name)-8s]\x1b[0m %(message)s\x1b[0m'),
 84          (logging.INFO,  '\x1b[38;5;19m>\x1b[38;5;18m>\x1b[38;5;17m> \x1b[38;5;239m[%(name)-8s]\x1b[0m %(message)s\x1b[0m'),
 85          (logging.WARNING, '\x1b[38;5;228m>\x1b[38;5;227m>\x1b[38;5;226m> \x1b[38;5;239m[%(name)-8s]\x1b[38;5;226m %(message)s\x1b[0m'),
 86          (logging.ERROR, '\x1b[38;5;208m>\x1b[38;5;202m>\x1b[38;5;196m> \x1b[38;5;239m[%(name)-8s]\x1b[38;5;196m %(message)s\x1b[0m'),
 87          (logging.CRITICAL, '\x1b[48;5;196;38;5;16m>>> [%(name)-8s] %(message)s\x1b[0m'),
 88      }
 89  
 90      class MyStreamHandler(logging.StreamHandler):
 91          def __init__(self, stream, formatters):
 92              logging.StreamHandler.__init__(self, stream)
 93              self.formatters = formatters
 94          def format(self, record):
 95              return self.formatters[record.levelno].format(record)
 96  
 97      formatters = {}
 98      for (level, fmtstr) in LOG_PREFMT:
 99          formatters[level] = logging.Formatter(fmtstr)
100      handler = MyStreamHandler(sys.stdout, formatters)
101      logging.basicConfig(level=logging.DEBUG, handlers=[handler])
102  
def safe_path(path: str, tmpdir: Optional[str] = None) -> bool:
    '''
    Ensure path is a location we can nuke without consequences.

    This is restricted to paths strictly below the temporary directory,
    e.g. /tmp/<anything>. The comparison is made per path component, so a
    sibling that merely shares the prefix (such as /tmpfoo/bar) is rejected,
    and so is the temporary directory itself.

    path:   path to check; relative paths are resolved against the current
            working directory. Symbolic links are not resolved.
    tmpdir: base directory that is considered safe; defaults to TMPDIR.

    Returns True if path may be deleted, False otherwise.
    '''
    base = os.path.abspath(TMPDIR if tmpdir is None else tmpdir)
    abspath = os.path.abspath(path)
    if abspath[0] != '/': return False # ???
    comps = abspath[1:].split('/') # skip leading slash to avoid relying on empty first component
    if len(comps) <= 1:
        # Never accept a directory directly below the filesystem root
        return False
    if abspath == base:
        # The temp directory itself is not ours to remove
        return False
    # commonpath compares whole components, unlike str.startswith
    return os.path.commonpath([abspath, base]) == base
112  
def shell_split(s: str) -> List[str]:
    '''Split a command-line string into arguments using shell-like quoting rules.'''
    tokens = shlex.split(s)
    return tokens
def shell_join(s) -> str:
    '''Join arguments into one space-separated string, shell-quoting each argument.'''
    quoted = map(shlex.quote, s)
    return ' '.join(quoted)
117  
def check_call(args) -> int:
    '''
    Wrapper for subprocess.check_call that logs what command failed.

    args: the command as a sequence of program name and arguments.

    Returns the command's exit status. This is always 0, because
    subprocess.check_call raises for any other status.

    Raises whatever subprocess.check_call raises (for example
    subprocess.CalledProcessError on a non-zero exit status); the exception
    is re-raised unchanged after the failing command line has been logged.
    '''
    try:
        return subprocess.check_call(args)
    except Exception:
        logger.error('Command failed: {}'.format(shell_join(args)))
        raise
125  
def cmd_exists(cmd) -> bool:
    '''
    Determine if a given command is available.

    cmd: command name to look up on PATH, or a path to an executable.

    Uses shutil.which, so no external "which" program is needed and nothing
    is printed for commands that are missing.
    '''
    return shutil.which(cmd) is not None
134  
def iterate_objs(srcdir, objext: Optional[str] = None) -> Iterator[str]:
    '''
    Iterate over all object files in srcdir.

    srcdir: directory to walk recursively.
    objext: file name suffix that identifies an object file; defaults to
            OBJEXT.

    Yields the path of every matching file relative to srcdir (files directly
    in srcdir are yielded as the bare file name). The result does not depend
    on whether srcdir is written with a trailing slash.
    '''
    if objext is None:
        objext = OBJEXT
    for (root, _dirs, files) in os.walk(srcdir):
        # relpath is used rather than slicing off len(srcdir)+1 characters,
        # which cuts one character too many when srcdir ends with a separator.
        rel_root = os.path.relpath(root, srcdir)
        if rel_root == os.curdir:
            rel_root = ''  # top level: yield 'x.o', not './x.o'
        for filename in files:
            if filename.endswith(objext):
                yield os.path.join(rel_root, filename)
144  
def copy_o_files(srcdir: str, tgtdir: str):
    '''
    Mirror every object file found under srcdir into tgtdir.

    The path of each file relative to srcdir is preserved below tgtdir;
    missing destination directories are created on the way.
    '''
    for relname in iterate_objs(srcdir):
        destination = os.path.join(tgtdir, relname)
        parent = os.path.dirname(destination)
        os.makedirs(parent, exist_ok=True)
        source = os.path.join(srcdir, relname)
        shutil.copy(source, destination)
151  
def objdump_all(srcdir: str, tgtdir: str):
    '''
    Object analysis pass using objdump.

    Every object file below srcdir is disassembled with OBJDUMP and
    OBJDUMP_ARGS. The output is split at the "Disassembly of section ...:"
    headers, lines mentioning '.rodata' are dropped, and each named section
    is written to <tgtdir>/<sha1 of the section name>.dis.

    The output file name depends only on the section name, so a section name
    that occurs in several object files ends up in a single file holding the
    content from the object processed last.

    Raises Exception if objdump exits with a non-zero status; the message
    names the object file and includes what objdump printed on stderr.
    '''
    section_header = re.compile(r'^Disassembly of section (.*):$')
    for relname in iterate_objs(srcdir):
        objname = os.path.join(srcdir, relname)
        p = subprocess.Popen([OBJDUMP] + OBJDUMP_ARGS + [objname], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (out, err) = p.communicate()
        # Undecodable bytes must not abort the whole analysis pass
        err = err.decode(errors='replace')
        if p.returncode != 0:
            raise Exception('objdump failed on {}: {}'.format(objname, err.strip()))
        out = out.decode(errors='replace')

        # postprocess- break into sections separated by 'Disassembly of section...'
        # Lines before the first header are collected under the '' key.
        sections = defaultdict(list)
        current = ''
        for line in out.splitlines():
            match = section_header.match(line)
            if match:
                current = match.group(1)
            if '.rodata' not in line:  # filter out 'ebc: R_X86_64_32        .rodata+0x1944'
                sections[current].append(line)

        for section, lines in sections.items():
            if not section:
                # skip the header part that precedes the first section
                continue
            name = hashlib.sha1(section.encode()).hexdigest()
            outname = os.path.join(tgtdir, name + '.dis')
            os.makedirs(os.path.dirname(outname), exist_ok=True)
            with open(outname, 'w') as f:
                f.write('\n'.join(lines))

    # Some TODOs, from learning about the objdump output:
    # - demangle section names
    # - remove/make relative addresses
    # - sort/combine sections
    # - remove duplicate sections? (sounds like linker's work - can we do a partial link that preserves sections, such as for inlines?)
    # - resolve callq's relocations - these are ugly right now - integrate reloc result into instruction by substituting argument
    #    - [-  17: R_X86_64_32S        vtable for boost::exception_detail::bad_exception_+0x30-]
    #    (at the very least delete callq's arguments)
    # - for data (mov etc): fill in data? pointers change arbitrarily especially in combined string tables (.rodata.str1...)
    #       and these entries don't have names/symbols
    # - or could use a different disassembler completely, such as capstone. Parsing objdump output is a hack.
206  
def parse_arguments():
    '''
    Parse and validate the command line.

    Returns the argparse namespace with three attributes post-processed:
      - patches:     dict mapping commit id to patch file name, pairing the
                     comma-separated --patches list with the commit ids in
                     order (empty dict when --patches is not given)
      - executables: list obtained by splitting --executables on commas
      - opt:         list of compiler flags; OPTFLAGS unless --opt is given

    Exits with status 1 if --opt does not start with '+', or if the
    repository would be copied (nocopy unset) to a directory that safe_path
    rejects.
    '''
    parser = argparse.ArgumentParser(description='Build to compare binaries. Execute this from a repository directory.')
    parser.add_argument('commitids', metavar='COMMITID', nargs='+')
    parser.add_argument('--executables', default='src/bitcoind', help='Comma-separated list of executables to build, default is "src/bitcoind"')
    parser.add_argument('--tgtdir', default=DEFAULT_TGTDIR, help='Target directory, default is "{}"'.format(DEFAULT_TGTDIR))
    parser.add_argument('--repodir', default=DEFAULT_REPODIR, help='Temp repository directory, default is "{}"'.format(DEFAULT_REPODIR))
    parser.add_argument('--parallelism', '-j', default=DEFAULT_PARALLELISM, type=int, help='Make parallelism, default is {}'.format(DEFAULT_PARALLELISM))
    parser.add_argument('--assertions', default=DEFAULT_ASSERTIONS, type=int, help='Build with assertions, default is {}'.format(DEFAULT_ASSERTIONS))
    parser.add_argument('--opt', default=None, type=str, help='Override C/C++ optimization flags. Prepend + to avoid collisions with arguments, e.g. "+-O2 -g"')
    parser.add_argument('--patches', '-P', default=None, type=str, help='Comma separated list of stripbuildinfo patches to apply, one per hash (in order).')
    parser.add_argument('--prefix', default=None, type=str, help='A depends prefix that will be passed to configure')
    parser.add_argument('--nocopy', default=DEFAULT_NOCOPY, type=int, help='Build directly in the repository. If unset, will rsync or copy the repository to a temporary directory first, default is {}'.format(DEFAULT_NOCOPY))
    args = parser.parse_args()
    # zip() stops at the shorter list: commits without a patch entry are left
    # out of the dict, surplus patch names are ignored.
    args.patches = dict(zip(args.commitids, [v.strip() for v in args.patches.split(',')])) if args.patches is not None else {}
    args.executables = args.executables.split(',')
    if args.opt is not None:
        # The leading '+' keeps argparse from mistaking e.g. "-O2" for an option
        if not args.opt.startswith('+'):
            print('"opt" argument must start with +', file=sys.stderr)
            # sys.exit rather than the site-provided exit(), which is meant
            # for interactive use and is absent under "python -S"
            sys.exit(1)
        args.opt = shell_split(args.opt[1:])
    else:
        args.opt = OPTFLAGS
    # Safety checks
    if not args.nocopy and not safe_path(args.repodir):
        # Suggest a directory of the same name below TMPDIR; joining the raw
        # argument would yield a malformed path for an absolute repodir.
        suggestion_name = os.path.basename(os.path.normpath(args.repodir)) or 'repo'
        logger.error('Temp repository directory {} may not be used. Please use {}, e.g. "{}/{}"'.format(args.repodir, TMPDIR, TMPDIR, suggestion_name))
        sys.exit(1)

    return args
235  
def main():
    '''
    Build every requested commit and collect the artifacts for comparison.

    Steps:
      1. Create the target directory; if it already exists and is a safe
         path, offer to delete it.
      2. Check that every commit id is hexadecimal.
      3. Unless nocopy is set, copy .git from the current directory to the
         temp repository directory (rsync if available, cp otherwise) and
         change into it.
      4. For each commit: reset/clean/checkout, apply the stripbuildinfo
         patch, run autogen and configure, build each executable, copy it
         and a stripped copy to the target directory, then copy the object
         files and run the objdump analysis pass.

    WARNING: the working tree is reset and cleaned for every commit, which
    removes all files that are not part of the repository.

    Exits with status 1 on an invalid commit id, when the patch cannot be
    applied, or when any other error occurs (the error is logged first).
    '''
    args = parse_arguments()
    init_logging()
    try:
        try:
            os.makedirs(args.tgtdir)
        except FileExistsError:
            logger.warning("{} already exists, remove it if you don't want to continue a current comparison session".format(args.tgtdir))
            if safe_path(args.tgtdir):
                dodelete = input("Delete {}? [y/n] ".format(args.tgtdir))
                if dodelete == 'y' or dodelete == 'Y':
                    # Remove target dir; it is recreated implicitly when the
                    # per-commit directories are made below
                    logger.info('Removing {}'.format(args.tgtdir))
                    check_call(['rm', '-rf', args.tgtdir])

        for commit in args.commitids:
            try:
                int(commit,16)
            except ValueError:
                logger.error('{} is not a hexadecimal commit id. It\'s the only thing we know.'.format(commit))
                sys.exit(1)

        # Copy repo, unless nocopy is set
        if not args.nocopy and safe_path(args.repodir):
            # RSYNC may carry extra arguments; split it once and use the same
            # argument list for both the existence check and the invocation
            rsync_cmd = shell_split(RSYNC)
            if rsync_cmd and cmd_exists(rsync_cmd[0]):
                logger.info('RSyncing repository ...')
                check_call(rsync_cmd + [
                    '-r',           # recursive
                    '--delete',     # delete extraneous files on dst
                    '.git',         # from .git in CWD
                    args.repodir])  # to repodir
            else:
                gitdir = os.path.join(args.repodir, '.git')
                logger.warning('Command "rsync" not found; resorting to cp, which tends to be slower.')
                logger.info('Copying repository ...')
                # Touch (to avoid file not found) and remove repodir/.git so we don't end up with repodir/.git/.git
                check_call(['mkdir','-p',args.repodir])
                check_call(['touch',gitdir])
                check_call(['rm','-rf',gitdir])
                check_call(['cp','-r','.git',args.repodir])
            # Go to repo
            os.chdir(args.repodir)

        # Determine (g)make arguments
        make_args = []
        if args.parallelism is not None:
            make_args += ['-j{}'.format(args.parallelism)]
        # Disable assertions if requested.
        # Work on a copy so the module-level CPPFLAGS list is not modified.
        cppflags = list(CPPFLAGS)
        if not args.assertions:
            cppflags+=['-DNDEBUG']

        for commit in args.commitids:
            logger.info("Building {}...".format(commit))
            stripbuildinfopatch = args.patches[commit] if commit in args.patches else DEFAULT_PATCH
            commitdir = os.path.join(args.tgtdir, commit)
            commitdir_obj = os.path.join(args.tgtdir, commit+'.o')

            try:
                os.makedirs(commitdir)
            except FileExistsError:
                # An existing directory means this commit was handled in an
                # earlier session
                logger.error("{} already exists; skipping".format(commitdir))
                continue
            # Bring the work tree into a pristine state at the commit
            check_call([GIT,'reset','--hard'])
            check_call([GIT,'clean','-f','-x','-d'])
            check_call([GIT,'checkout',commit])
            try:
                if commit in args.patches:
                    logger.info('User-defined patch: {}'.format(stripbuildinfopatch))
                check_call([GIT,'apply', os.path.join(PATCHDIR,stripbuildinfopatch)])
            except subprocess.CalledProcessError:
                logger.error('Could not apply patch to strip build info. Probably it needs to be updated')
                sys.exit(1)

            check_call(['./autogen.sh'])
            logger.info('Running configure script')
            opt = shell_join(args.opt)
            check_call(['./configure', '--disable-hardening', '--without-cli', '--disable-tests', '--disable-bench', '--disable-ccache',
                '--prefix={}'.format(args.prefix) if args.prefix else '--with-incompatible-bdb',
                'CPPFLAGS='+(' '.join(cppflags)),
                'CFLAGS='+opt, 'CXXFLAGS='+opt, 'LDFLAGS='+opt] + CONFIGURE_EXTRA)

            for name in args.executables:
                logger.info('Building executable {}'.format(name))
                target_name = os.path.join(args.tgtdir, os.path.basename(name) + '.' + commit)
                check_call([MAKE] + make_args + [name])
                # Keep both the executable as built and a stripped copy
                shutil.copy(name, target_name)
                check_call([OBJCOPY] + OBJCOPY_ARGS + [name, target_name + '.stripped'])

            logger.info('Copying object files...')
            copy_o_files('.', commitdir_obj)

            logger.info('Performing basic analysis pass...')
            objdump_all(commitdir_obj, commitdir)

        if len(args.commitids)>1:
            logger.info('Use these commands to compare results:')
            logger.info('$ sha256sum {}/*.stripped'.format(args.tgtdir))
            logger.info('$ git diff -W --word-diff {} {}'.format(os.path.join(args.tgtdir,args.commitids[0]), os.path.join(args.tgtdir,args.commitids[1])))
    except Exception:
        logger.exception('Error:')
        # Report the failure to the caller instead of exiting with status 0
        sys.exit(1)
# Run the build only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
340