Cradicle Explorer

spread.py
  1  import copy, re
  2  from sets import Set
  3  # NOTE THAT ALL FUNCTIONS HERE WORK BY RETURNING A NEW ARRAY, THERE ARE NO IN-PLACE MODIFICATION METHODS
  4  # ALSO KEEP IN MIND THAT COLUMNS AND ROW INDICES START FROM ZERO (ie. column A -> 0, E -> 4, Z -> 25, etc)
  5  # Scans a CSV line, keeping track of separators and quotes
  6  def scanline(ln,sp=','):
  7    arr = []
  8    inq = False
  9    ind = 0
 10    buff = ""
 11    while ind < len(ln):
 12      if ln[ind] == '"':
 13         inq = not inq
 14         ind += 1
 15      elif ln[ind] == '\\':
 16         buff += ln[ind+1]
 17         ind += 2
 18      elif ln[ind] == sp and not inq:
 19         arr.append(buff)
 20         buff = ""
 21         ind += 1
 22      else:
 23         buff += ln[ind]   
 24         ind += 1
 25    arr.append(buff)
 26    return arr
 27  # Load a CSV into a 2D array, automatically filling in unevenly wide rows to make the array square
 28  def load(f,sp=','):
 29    array = [scanline(x,sp) for x in open(f,'r').readlines()]
 30    maxlen = 0
 31    for i in range(len(array)):
 32      if len(array[i]) > maxlen: maxlen = len(array[i])
 33    for i in range(len(array)):
 34      if len(array[i]) < maxlen: array[i] += [''] * (maxlen - len(array[i]))
 35    return array
 36  # Apply a Porter stemmer to every cell in a given range of cols in an array (calling stem with just a list and no cols argument stems _every_ cell)
 37  # Example outputs: manage, management, manager, managing -> manag; pony, ponies -> poni; reincarnate, reincarnated, reincarnation -> reincarn
 38  def stem(li,cols=0):
 39    if cols == 0: cols = range(len(li[0]))
 40    import porter
 41    pstemmer = porter.PorterStemmer()
 42    newlist = copy.deepcopy(li)
 43    for i in range(len(li)):
 44      for j in cols:
 45        string = str(li[i][j])
 46        for ch in "'"+'"+[]?!\n': string = string.replace(ch,'')
 47        words = string.split(' ')
 48        newlist[i][j] = ' '.join([pstemmer.stem(x.strip().lower(),0,len(x.strip())-1) for x in words])
 49    return newlist
 50  # Is a string a number?
 51  def isnumber(s):
 52      t = re.findall('^-?[0-9,]*\.[0-9,]*$',s)
 53      return len(t) > 0
 54              
 55  # Declutters (removes special characters, numerifies numbers) every cell, rules same as those for stem(li,cols=0)
 56  def declutter(li,cols=0):
 57    if cols == 0: cols = range(len(li[0]))
 58    newlist = copy.deepcopy(li)
 59    for i in range(len(li)):
 60      for j in cols:
 61        string = str(li[i][j])
 62        for ch in "'"+'"+[]?!\n': string = string.replace(ch,'')
 63        words = string.split(' ')
 64        newlist[i][j] = ' '.join([x.strip().lower() for x in words])
 65        if isnumber(newlist[i][j]):
 66          newlist[i][j] = float(newlist[i][j])
 67    return newlist
 68  # Generate a list of individual words occurring in a given column in a given array; useful for generating source lists to do n-grams from
 69  def wordlist(li,col):
 70    wlist = []
 71    for i in range(len(li)):
 72        words = li[i][col].split(' ')
 73        for w in words:
 74          if w not in wlist: wlist.append(w)
 75    return wlist
 76  # Generates a list of phrases (complete cell entries)
 77  def phraselist(li,col):
 78    wlist = []
 79    for i in range(len(li)):
 80       phrase= li[i][col]
 81       if phrase not in wlist: wlist.append(phrase)
 82    return wlist
 83  # Retrieve just a few columns from a given array to make a smaller (narrower) array
 84  def cols(li,cols):
 85    result = []
 86    for i in range(len(li)):
 87      newline = []
 88      for c in cols: 
 89        if c >= 0: newline.append(li[i][c])
 90        else: newline.append(1)
 91      result.append(newline)
 92    return result
 93  # Combine two possibly unsorted arrays matching rows by heading in headingcol1 in li1 and headingcol2 in li2
 94  # setting linclusive = True makes sure every row in li1 makes it into the output, same with rinclusive and li2
 95  # Recommended to do some kind of sort after splice is done
 96  def splice(li1,li2,headingcol1,headingcol2,linclusive=False,rinclusive=False):
 97    s1 = sorted(li1,key=lambda x:x[headingcol1],reverse=True)
 98    s2 = sorted(li2,key=lambda x:x[headingcol2],reverse=True)
 99    l1 = len(s1[0])
100    l2 = len(s2[0])
101    ind1 = 0
102    ind2 = 0
103    output = []
104    while ind1 < len(s1) and ind2 < len(s2):
105      if cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == 1:
106        if rinclusive: output.append([s2[ind2][headingcol2]] + [''] * (l1-1) + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])
107        ind2 += 1
108      elif cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == -1:
109        if linclusive: output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * (l2-1))
110        ind1 += 1
111      else:
112        output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])
113        ind1, ind2 = ind1 + 1, ind2 + 1
114    while ind1 < len(s1) and linclusive:
115      output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * l2)
116      ind1 += 1
117    while ind2 < len(s2) and rinclusive:
118      output.append([s2[ind2][headingcol2]] + [''] * l1 + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])
119      ind2 += 1
120    return output
121  # Creates a wordlist sorted according to function f taken of an array with the results in the addcols in order
122  # eg. sorted_wordlist with addcols = [2,4,6], row is 1 2 4 8 16 32 64, f=lambda x:x[2]+x[1]+1.01*x[0] returns sorting key 84.04
123  def sorted_wordlist(li,wcol,addcols,f=lambda x:x[1],rev=True):
124    return [x[0] for x in sorted(onegrams(li,wcol,addcols),key=f,reverse=rev)]
125  # Utility function, used by twograms, threegrams and fourgrams
126  def compose(arg):
127    return ' '.join(sorted(list(Set(arg))))
128  # Calculate a total sum for every desired column for different exact matches in wcol, column -1 is implied to be 1 for every row
129  # for example, consider the array
130  # dog        20   3
131  # dog house  15   28
132  # cat        25   31
133  # cat        10   7
134  # dog        40   0
135  # house      10   14
136  # Doing pivot(li,0,[1,-1]) gives you the list:
137  # dog        60   2
138  # dog house  15   1
139  # cat        35   2
140  # house      10   1
141  # wlist allows you to restrict the table to a given wordlist
142  def pivot(li, wcol, addcols,wlist=0,sortkey=lambda x:1):
143    if wlist == 0: wlist = phraselist(li,wcol)
144    result = {}
145    for i in range(len(wlist)):
146      result[wlist[i]] = [0] * len(addcols)
147    for i in range(len(li)):
148      nums = []
149      for ac in addcols:
150        if ac >= 0:
151          num = str(li[i][ac]).replace(',','').replace(' ','')
152          if num == '': num = 0
153          elif num[-1] == '%': num = float(num[:-1] * 0.01)
154          else: num = float(num)
155        else: num = 1
156        nums.append(num)
157      if li[i][wcol] in result: result[li[i][wcol]] = [pair[0] + pair[1] for pair in zip(result[li[i][wcol]],nums)]
158    array = []
159    for word in result.keys():
160      array.append([word] + result[word])
161    return sorted(array,key=sortkey,reverse=True)
162  # Similar to a pivot table but looks at individual keywords. The example list above will return with onegrams(li,0,[1,2]):
163  # dog        75   3
164  # cat        35   2
165  # house      25   2
166  def onegrams(li, wcol, addcols,wlist=0,sortkey=lambda x: 1):
167    if wlist == 0: wlist = wordlist(li,wcol) 
168    result = {}
169    for i in range(len(wlist)):
170      result[wlist[i]] = [0] * len(addcols)
171    for i in range(len(li)):
172      words = [x.strip() for x in li[i][wcol].split(' ')]
173      nums = []
174      for ac in addcols:
175        if ac >= 0: 
176          num = str(li[i][ac]).replace(',','').replace(' ','')
177          if num == '': num = 0
178          elif num[-1] == '%': num = float(num[:-1] * 0.01)
179          else: num = float(num)
180        else: num = 1
181        nums.append(num)
182      for i in range(len(words)):
183        if words[i] in result: result[words[i]] = [pair[0] + pair[1] for pair in zip(result[words[i]],nums)]
184    array = []
185    for word in result.keys():
186      array.append([word] + result[word])
187    return sorted(array,key=sortkey,reverse=True)
188  # Calculate a total sum for every column in addcols and for every word pair in wcol
189  # words do not need to be beside each other or in any particular order, so "buy a dog house", "good house for dog owners", "dog in my house" all go under "dog house"
190  def twograms(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
191    if wlist == 0: wlist = wordlist(li,wcol) 
192    result = {}
193    if allindices:
194      for i in range(len(wlist)):
195        for j in range(len(wlist)):
196          if i != j: result[compose([wlist[i],wlist[j]])] = [0] * len(addcols)
197    for i in range(len(li)):
198      if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Two grams: " + str(i) + " / " + str(len(li))
199      words = [x.strip() for x in li[i][wcol].split(' ')]
200      nums = []
201      for ac in addcols:
202        if ac >= 0: 
203          num = str(li[i][ac]).replace(',','').replace(' ','')
204          if num == '': num = 0
205          elif num[-1] == '%': num = float(num[:-1]) * 0.01
206          else: num = float(num)
207        else: num = 1
208        nums.append(num)
209      for i in range(len(words)):
210        if words[i] in wlist:
211          for j in range(i+1,len(words)):
212            if words[j] in wlist:
213              comb = compose([words[i],words[j]])
214              if comb in result: result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]
215              elif allindices == False: result[comb] = nums
216    array = []
217    for words in result.keys():
218      array.append([words] + result[words])
219    return sorted(array,key=sortkey,reverse=True)
220  # Calculate a total sum for every column in addcols and for every word triplet in wcol (do not need to be beside each other or in any particular order)
221  # setting allindices to True slows down the calculation a lot but gives you a CSV with all possible combinations of words, making it convenient for
222  # working with the same word list on different data
223  def threegrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
224    if wlist == 0: wlist = wordlist(li,wcol) 
225    result = {}
226    if allindices:
227      for i in range(len(wlist)):
228          for j in range(len(wlist)):
229            for k in range(len(wlist)):
230                if i != j and i != k and j != k: result[compose([wlist[i],wlist[j],wlist[k]])] = [0] * len(addcols)
231    for i in range(len(li)):
232      if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Three grams: " + str(i) + " / " + str(len(li))
233      words = [x.strip() for x in li[i][wcol].split(' ')]
234      nums = []
235      for ac in addcols:
236        if ac >= 0:
237          num = str(li[i][ac]).replace(',','').replace(' ','')
238          if num == '': num = 0
239          elif num[-1] == '%': num = float(num[:-1]) * 0.01
240          else: num = float(num)
241        else: num = 1
242        nums.append(num)
243      for i in range(len(words)):
244        if words[i] in wlist:
245          for j in range(i+1,len(words)):
246            if words[j] in wlist:
247              for k in range(j+1,len(words)):
248                if words[k] in wlist:
249                  comb = compose([words[i],words[j],words[k]])
250                  if comb in result: 
251                    result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]
252                  elif allindices == False: result[comb] = nums
253    array = []
254    for words in result.keys():
255      array.append([words] + result[words])
256    return sorted(array,key=sortkey,reverse=True)
257  # Calculate a total sum for every column in addcols and for every word quadruplet in wcol
258  def fourgrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1):
259    if wlist == 0: wlist = wordlist(li,wcol) 
260    result = {}
261    for i in range(len(li)):
262      if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Four grams: " + str(i) + " / " + str(len(li))
263      words = [x.strip() for x in li[i][wcol].split(' ')]
264      nums = []
265      for ac in addcols:
266        if ac >= 0:
267          num = str(li[i][ac]).replace(',','').replace(' ','')
268          if num == '': num = 0
269          elif num[-1] == '%': num = float(num[:-1]) * 0.01
270          else: num = float(num)
271        else: num = 1
272        nums.append(num)
273      for i in range(len(words)):
274        if words[i] in wlist:
275          for j in range(i+1,len(words)):
276            if words[j] in wlist:
277              for k in range(j+1,len(words)):
278                if words[j] in wlist:
279                  for l in range(k+1,len(words)):
280                    if words[l] in wlist:
281                      comb = compose([words[i],words[j],words[k],words[l]])
282                      if comb in result:
283                        result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]
284                      else: result[comb] = nums
285                    
286    array = []
287    for words in result.keys():
288      array.append([words] + result[words])
289    return sorted(array,key=sortkey,reverse=True)
290  # Filters array, returning only the rows where column wcol of that row contains the query keywords (keywords can appear in any order)
291  # This and the other filters are useful for taking a list of entries and creating a list of only valid entries according to some validity characteristic
292  # eg:
293  # dog house, 15
294  # cat, 18
295  # dog, 33
296  # filter(li,0,'dog'):
297  # dog house, 15
298  # dog, 33
299  def filter(li,wcol,query):
300    result = []
301    for i in range(len(li)):
302      words = [x.strip() for x in li[i][wcol].split(' ')]
303      inlist = True
304      queryarray = query.split(' ')
305      if queryarray == ['']: queryarray = []
306      for w in queryarray:
307        if w not in words: inlist = False
308      if queryarray == ['*']: inlist = len(li[i][wcol]) > 0
309      if inlist: result.append(li[i])
310    return result
311  # Filters array, requiring column wcol to exactly match query
312  def phrasefilter(li,wcol,query):
313    result = []
314    for i in range(len(li)):
315      if li[i][wcol] == query: result.append(li[i])
316    return result
317  # Filters array, requiring function func taken of the row to return True (or 1)
318  def funcfilter(li,func):
319    result = []
320    for i in range(len(li)):
321      if func(li[i]): result.append(li[i])
322    return result
323  # Adds up columns in addcols for a query matching keyfilter(li,wcol,query); can also be thought of as doing a single n-keyword match
324  # eg:
325  # dog, 25
326  # cat, 15
327  # dog, 75
328  # dog, 10
329  # horse, 55
330  # cat, 7
331  # search(li,0,[1],'dog') gives ['dog',110]
332  def search(li,wcol,addcols,query):
333    result = [0] * len(addcols)
334    for i in range(len(li)):
335      words = [x.strip() for x in li[i][wcol].split(' ')]
336      nums = []
337      for ac in addcols:
338        if ac >= 0:
339          num = str(li[i][ac]).replace(',','').replace(' ','')
340          if num == '': num = 0
341          elif num[-1] == '%': num = float(num[:-1] * 0.01)
342          else: num = float(num)
343        else: num = 1
344        nums.append(num)
345      inlist = True
346      queryarray = query.split(' ')
347      if queryarray == ['']: queryarray = []
348      for w in queryarray:
349        if w not in words: inlist = False
350      if queryarray == ['*']: inlist = len(li[i][wcol]) > 0
351      if inlist:
352        result = [pair[0] + pair[1] for pair in zip(result,nums)]
353    return [query] + result
354  # Print a CSV from an array to stdout
355  def tochars(array,sp=','):
356    string = ""
357    for line in array: string += sp.join([str(x) for x in line]) + '\n'
358    return string[:-1]
359  # Save an array to CSV
360  def save(f,array,sp=','):
361    writeto = open(f,'w')
362    writeto.write(tochars(array,sp))
363    writeto.close()
364  # Compares keywords by two different parameters from two different lists. For example, li1 can be a list of how much money is spent (on addcol1) on a particular combination of keywords (on keycol1) and li2 can be a list of upgraded accounts with the search query they came from on keycol2, and addcol 2 can be left blank to default to -1 (each row is worth one point). Fourth column is statistical significance.
365  # Remember that you may have to filter the list yourself first
366  # Arguments:
367  # grams = 1 for single keywords, 2 for pairs, 3 for triplets and 4 for quadruplets
368  # li1, li2 = your two lists
369  # keycol1, keycol2 = where the keywords are located in those two lists
370  # addcol1, addcol2 = the columns of what you want to add up, eg. cost (set to -1 or leave blank to make it add 1 for each row)
371  # sortkey = function to sort results by (highest first)
372  # usestem = stem keywords
373  # sigtable = add ratio and significance to table
374  # invertratio = set ratio column to col1/col2 instead of col2/col1
375  # preformatted = li1 and li2 are already properly formatted
376  # justpreformat = convert li1 and li2 into twocolumns for comparison but don't go all the way
377  # wordlimit = limit search to some more common keywords for speedup purposes
378  # Example: list of customers, some upgraded, with originating keywords, and a list of how much you're paying for each search phrase
379  #
380  # customers.csv:
381  # Name, Keyword, Status
382  # Bob Jones, spreadsheet csv software, upgraded
383  # Matt Bones, csv python utils, free
384  # Army Drones, free spreadsheet, free
385  # Glenn Mitt, csv software, upgraded
386  # Pat Submitt, python utils software, upgraded
387  # Shawn Wit, python spreadsheet program, upgraded
388  #
389  # costs.csv:
390  # csv software, useless, and, irrelevant, data, 5.00, blah, blah
391  # python spreadsheet, useless, and, irrelevant, data, 2.50, blah, blah
392  # spreadsheet utils, useless, and, irrelevant, data, 10.00, blah, blah
393  # csv utils, useless, and, irrelevant, data, 1.50, blah, blah
394  # 
395  # Steps:
396  # 1. import spread (if not imported already)
397  # 2. upgrades = spread.filter(spread.load('customers.csv'),2,'upgraded')
398  # 3. costs = spread.load('costs.csv')
399  # 4. res = compare(1,costs,upgrades,0,1,5,invertratio=True)
400  # 5. spread.save('saved.csv',res)
401  #
402  # Res should look like:
403  #
404  # Keyword, Column 1, Column 2, Ratio, Significance
405  # spreadsheet, 12.50, 2, 6.25, -0.389
406  # utils, 11.50, 1, 11.50, -0.913
407  # csv, 7.50, 2, 3.75, 0.335
408  # python, 2.50, 2, 1.25, 2.031
409  #
410  # Or, if desired, you can:
411  # i1,i2 = compare(1,costs,upgrades,0,1,5,justpreformat=True)
412  # res1 = compare(1,i1,i2,0,1,5,invertratio=True,preformatted=True)
413  # res2 = compare(2,i1,i2,0,1,5,invertratio=True,preformatted=True)
414  # res3 = compare(3,i1,i2,0,1,5,invertratio=True,preformatted=True)
415  # res4 = compare(4,i1,i2,0,1,5,invertratio=True,preformatted=True)
416  #
417  # Note that significance is calculated based on col2/col1 regardless of invertratio, since getting 0 upgrades when you should have gotten 2 is not that unlikely, but calculating significance based on col1/col2 would give you infinity as infinity is infinitely far away from 0.5.
418  def compare(grams,li1,li2,keycol1,keycol2,addcol1=-1,addcol2=-1,sortkey=lambda x:x[1],usestem=True,sigtable=True,invertratio=False,preformatted=False,justpreformat=False,wordlimit=0):
419    gramfuncs = [0,onegrams,twograms,threegrams,fourgrams]
420    if preformatted == False:
421      s1 = declutter(cols(li1,[keycol1,addcol1]),[1])
422      print "Done decluttering/stemming: 1/4"
423      s2 = declutter(cols(li2,[keycol2,addcol2]),[1])
424      print "Done decluttering/stemming: 2/4"
425      s1 = stem(s1,[0]) if usestem else declutter(s1,[0])
426      print "Done decluttering/stemming: 3/4"
427      s2 = stem(s2,[0]) if usestem else declutter(s2,[0])
428      print "Done decluttering/stemming: 4/4"
429    else: s1,s2 = li1,li2
430    print "Printing sample of list 1"
431    print s1[:10]
432    print "Printing sample of list 2"
433    print s2[:10]
434    if justpreformat: return s1,s2
435    while type(s1[0][1]) is str: s1.pop(0)
436    while type(s2[0][1]) is str: s2.pop(0)
437    print "Cleaned invalid rows"
438    wl = sorted_wordlist(s1,0,[1])
439    if wl.count('') > 0: blank = wl.pop(wl.index(''))
440    print "Base wordlist length: " + str(len(wl)) + " ; Top ten: " + str(wl[:10])
441    if wordlimit > 0 and wordlimit < len(wl):
442      print "Shortening to " + str(wordlimit)
443      wl = wl[:wordlimit]
444    res1 = gramfuncs[grams](s1,0,[1],wl)
445    print "Done search: 1/2"
446    res2 = gramfuncs[grams](s2,0,[1],wl)
447    print "Done search: 2/2"
448    comb = sorted(splice(res1,res2,0,0),key=sortkey,reverse=True)
449    if sigtable:
450      tot1 = search(s1,0,[1],'')
451      tot2 = search(s2,0,[1],'')
452      ev = tot2[1]*1.0/tot1[1]
453      print "Totals: " + str(tot1[1]) + ", " + str(tot2[1])
454      for i in range(len(comb)): 
455        comb[i].append(comb[i][2 - invertratio]*1.0/(comb[i][1 + invertratio] + 0.000001))
456        comb[i].append((comb[i][2] - ev * comb[i][1])*1.0/(ev * comb[i][1] + 0.000001) ** 0.5)
457      comb = [['Keyword','Column 1','Column 2','Ratio','Significance']] + comb
458    else: comb = [['Keyword','Column 1','Column 2']] + comb
459    print "Done"
460    return comb