/ mkmissing
mkmissing
  1  #!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""
  4  find pattern (probably subject id) in files of one step missing in the next
  5  by matching a regular expression against expanded wildcard globs
  6  
  7  Example:
  8     mkmissing -1 'bids/sub-*' -2 'proc/sub-*' # show subj in bids but not proc
  9     # either side can have more than one glob to use
 10     mkmissing -1 'bids/sub-*' -2 'proc_a/sub-*' -2 'proc_b/sub-*'
 11  
 12  Options:
 13      search and replace to normalize: eg. / -> _
 14      -s '(\d{5})/(\d{8})' -r '\1_\2' # specific
 15      -s '/' -r '_'                   # general
 16  
 17      verbose with -v to see contents of inset and outset
 18      as well as their intersection in addition to -1 not in -2
 19  
 20      -p sets the pattern extracted from each file to compare between sets
 21      for BIDS vs fmriprep: -p 'sub-\d+_ses-\d+_task-[^_]*'
 22      for simple ld8:       -p '\d{5}_\d{8}'
 23  """
 24  import re
 25  from glob import glob
 26  
 27  
 28  def patt_in_wildcard(patt, wildcards, searchreplace=None):
 29      """expand wildcards and extract matches"""
 30      if not isinstance(wildcards, list):
 31          wildcards = [wildcards]
 32      ext = [patt.search(x) for w in wildcards for x in glob(w)]
 33      ext = [m.group(0) for m in ext if m is not None]
 34      if searchreplace is not None:
 35          ext = [re.sub(searchreplace[0], searchreplace[1], x) for x in ext]
 36      # set: sort and uniq
 37      return set(ext)
 38  
 39  
 40  def __main__():
 41      from argparse import RawTextHelpFormatter, ArgumentParser
 42      parser = ArgumentParser(description=__doc__,
 43                              formatter_class=RawTextHelpFormatter)
 44      parser.add_argument('-1', '--inglob', dest='in_glob', required=True,
 45                          action='append',
 46                          help="input file wildcard")
 47      parser.add_argument('-2', '--outglob', dest='out_glob', required=True,
 48                          action='append',
 49                          help="step 2 file wildcard")
 50      parser.add_argument('-p', '--pattern', dest='patt',
 51                          help="pattern to match in file names",
 52                          default="\d{5}[-_/]\d{8}")
 53      parser.add_argument('-s', '--search', dest='search',
 54                          help="regexp search. pair with -r/--replace",
 55                          default=None)
 56      parser.add_argument('-r', '--replace', dest='replace',
 57                          help="string replacement. pair with -s/--search",
 58                          default=None)
 59      parser.add_argument('-v', '--verbose', action="store_true")
 60      parser.add_argument('-o', '--saveto', dest='saveto',
 61                          help="file to write if any differences\n" +
 62                          "if not specified, output is printed to terminal",
 63                          default=None)
 64      parser.add_argument('-e', '--save_empty', action='store_true',
 65                          help="save file even if there is no difference\n" +
 66                          "default is false for `make`'s timestamp compare",
 67                          default=None)
 68      args = parser.parse_args()
 69  
 70      p = re.compile(args.patt)
 71  
 72      # setup search and replace if we have both
 73      search_replace = None
 74      if args.replace is None and args.search is not None or \
 75         args.search is None and args.replace is not None:
 76              print("need both -s and -r!")
 77              parser.print_help()
 78              parser.exit()
 79      if args.search is not None:
 80          search_replace = (re.compile(args.search), args.replace)
 81  
 82      # within 'in' set but missing from 'out' set
 83      # order matters: set([1,2,3]) - set([1,2,4]) = {3}
 84      inset = patt_in_wildcard(p, args.in_glob, search_replace)
 85      outset = patt_in_wildcard(p, args.out_glob, search_replace)
 86      missing = inset - outset
 87  
 88      # when debuging pattern, it's useful to see the matches
 89      if args.verbose:
 90          print(f"ARGS: p={p}; search_replace={search_replace}")
 91          print(f"INSET: {len(inset)} in {args.in_glob}")
 92          print(inset)
 93          print(f"OUTSET: {len(outset)} in {args.out_glob}")
 94          print(outset)
 95          print(f"MISSING: {len(missing)}")
 96          print(missing)
 97  
 98          # likely to be exactly inset, but maybe worth checking
 99          both = inset.intersection(outset)
100          print(f"BOTH: {len(both)}")
101          print(both)
102  
103      outstr = "\n".join(missing)
104      # dont write to file if none provided
105      # and only write empty missing to file if save_empty is set
106      if args.saveto is None:
107          print(outstr)
108      elif args.save_empty or missing:
109          with open(args.saveto, 'w') as f:
110              f.write(outstr + "\n")
111  
112  
# run the command-line tool only when executed as a script, not on import
if __name__ == "__main__":
    __main__()