spread.py
1 import copy, re 2 from sets import Set 3 # NOTE THAT ALL FUNCTIONS HERE WORK BY RETURNING A NEW ARRAY, THERE ARE NO IN-PLACE MODIFICATION METHODS 4 # ALSO KEEP IN MIND THAT COLUMNS AND ROW INDICES START FROM ZERO (ie. column A -> 0, E -> 4, Z -> 25, etc) 5 # Scans a CSV line, keeping track of separators and quotes 6 def scanline(ln,sp=','): 7 arr = [] 8 inq = False 9 ind = 0 10 buff = "" 11 while ind < len(ln): 12 if ln[ind] == '"': 13 inq = not inq 14 ind += 1 15 elif ln[ind] == '\\': 16 buff += ln[ind+1] 17 ind += 2 18 elif ln[ind] == sp and not inq: 19 arr.append(buff) 20 buff = "" 21 ind += 1 22 else: 23 buff += ln[ind] 24 ind += 1 25 arr.append(buff) 26 return arr 27 # Load a CSV into a 2D array, automatically filling in unevenly wide rows to make the array square 28 def load(f,sp=','): 29 array = [scanline(x,sp) for x in open(f,'r').readlines()] 30 maxlen = 0 31 for i in range(len(array)): 32 if len(array[i]) > maxlen: maxlen = len(array[i]) 33 for i in range(len(array)): 34 if len(array[i]) < maxlen: array[i] += [''] * (maxlen - len(array[i])) 35 return array 36 # Apply a Porter stemmer to every cell in a given range of cols in an array (calling stem with just a list and no cols argument stems _every_ cell) 37 # Example outputs: manage, management, manager, managing -> manag; pony, ponies -> poni; reincarnate, reincarnated, reincarnation -> reincarn 38 def stem(li,cols=0): 39 if cols == 0: cols = range(len(li[0])) 40 import porter 41 pstemmer = porter.PorterStemmer() 42 newlist = copy.deepcopy(li) 43 for i in range(len(li)): 44 for j in cols: 45 string = str(li[i][j]) 46 for ch in "'"+'"+[]?!\n': string = string.replace(ch,'') 47 words = string.split(' ') 48 newlist[i][j] = ' '.join([pstemmer.stem(x.strip().lower(),0,len(x.strip())-1) for x in words]) 49 return newlist 50 # Is a string a number? 51 def isnumber(s): 52 t = re.findall('^-?[0-9,]*\.[0-9,]*$',s) 53 return len(t) > 0 54 55 # Declutters (removes special characters, numerifies numbers) every cell, rules same as those for stem(li,cols=0) 56 def declutter(li,cols=0): 57 if cols == 0: cols = range(len(li[0])) 58 newlist = copy.deepcopy(li) 59 for i in range(len(li)): 60 for j in cols: 61 string = str(li[i][j]) 62 for ch in "'"+'"+[]?!\n': string = string.replace(ch,'') 63 words = string.split(' ') 64 newlist[i][j] = ' '.join([x.strip().lower() for x in words]) 65 if isnumber(newlist[i][j]): 66 newlist[i][j] = float(newlist[i][j]) 67 return newlist 68 # Generate a list of individual words occurring in a given column in a given array; useful for generating source lists to do n-grams from 69 def wordlist(li,col): 70 wlist = [] 71 for i in range(len(li)): 72 words = li[i][col].split(' ') 73 for w in words: 74 if w not in wlist: wlist.append(w) 75 return wlist 76 # Generates a list of phrases (complete cell entries) 77 def phraselist(li,col): 78 wlist = [] 79 for i in range(len(li)): 80 phrase= li[i][col] 81 if phrase not in wlist: wlist.append(phrase) 82 return wlist 83 # Retrieve just a few columns from a given array to make a smaller (narrower) array 84 def cols(li,cols): 85 result = [] 86 for i in range(len(li)): 87 newline = [] 88 for c in cols: 89 if c >= 0: newline.append(li[i][c]) 90 else: newline.append(1) 91 result.append(newline) 92 return result 93 # Combine two possibly unsorted arrays matching rows by heading in headingcol1 in li1 and headingcol2 in li2 94 # setting linclusive = True makes sure every row in li1 makes it into the output, same with rinclusive and li2 95 # Recommended to do some kind of sort after splice is done 96 def splice(li1,li2,headingcol1,headingcol2,linclusive=False,rinclusive=False): 97 s1 = sorted(li1,key=lambda x:x[headingcol1],reverse=True) 98 s2 = sorted(li2,key=lambda x:x[headingcol2],reverse=True) 99 l1 = len(s1[0]) 100 l2 = len(s2[0]) 101 ind1 = 0 102 ind2 = 0 103 output = [] 104 while ind1 < len(s1) and ind2 < len(s2): 105 if cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == 1: 106 if rinclusive: output.append([s2[ind2][headingcol2]] + [''] * (l1-1) + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:]) 107 ind2 += 1 108 elif cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == -1: 109 if linclusive: output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * (l2-1)) 110 ind1 += 1 111 else: 112 output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:]) 113 ind1, ind2 = ind1 + 1, ind2 + 1 114 while ind1 < len(s1) and linclusive: 115 output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * l2) 116 ind1 += 1 117 while ind2 < len(s2) and rinclusive: 118 output.append([s2[ind2][headingcol2]] + [''] * l1 + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:]) 119 ind2 += 1 120 return output 121 # Creates a wordlist sorted according to function f taken of an array with the results in the addcols in order 122 # eg. sorted_wordlist with addcols = [2,4,6], row is 1 2 4 8 16 32 64, f=lambda x:x[2]+x[1]+1.01*x[0] returns sorting key 84.04 123 def sorted_wordlist(li,wcol,addcols,f=lambda x:x[1],rev=True): 124 return [x[0] for x in sorted(onegrams(li,wcol,addcols),key=f,reverse=rev)] 125 # Utility function, used by twograms, threegrams and fourgrams 126 def compose(arg): 127 return ' '.join(sorted(list(Set(arg)))) 128 # Calculate a total sum for every desired column for different exact matches in wcol, column -1 is implied to be 1 for every row 129 # for example, consider the array 130 # dog 20 3 131 # dog house 15 28 132 # cat 25 31 133 # cat 10 7 134 # dog 40 0 135 # house 10 14 136 # Doing pivot(li,0,[1,-1]) gives you the list: 137 # dog 60 2 138 # dog house 15 1 139 # cat 35 2 140 # house 10 1 141 # wlist allows you to restrict the table to a given wordlist 142 def pivot(li, wcol, addcols,wlist=0,sortkey=lambda x:1): 143 if wlist == 0: wlist = phraselist(li,wcol) 144 result = {} 145 for i in range(len(wlist)): 146 result[wlist[i]] = [0] * len(addcols) 147 for i in range(len(li)): 148 nums = [] 149 for ac in addcols: 150 if ac >= 0: 151 num = str(li[i][ac]).replace(',','').replace(' ','') 152 if num == '': num = 0 153 elif num[-1] == '%': num = float(num[:-1] * 0.01) 154 else: num = float(num) 155 else: num = 1 156 nums.append(num) 157 if li[i][wcol] in result: result[li[i][wcol]] = [pair[0] + pair[1] for pair in zip(result[li[i][wcol]],nums)] 158 array = [] 159 for word in result.keys(): 160 array.append([word] + result[word]) 161 return sorted(array,key=sortkey,reverse=True) 162 # Similar to a pivot table but looks at individual keywords. The example list above will return with onegrams(li,0,[1,2]): 163 # dog 75 3 164 # cat 35 2 165 # house 25 2 166 def onegrams(li, wcol, addcols,wlist=0,sortkey=lambda x: 1): 167 if wlist == 0: wlist = wordlist(li,wcol) 168 result = {} 169 for i in range(len(wlist)): 170 result[wlist[i]] = [0] * len(addcols) 171 for i in range(len(li)): 172 words = [x.strip() for x in li[i][wcol].split(' ')] 173 nums = [] 174 for ac in addcols: 175 if ac >= 0: 176 num = str(li[i][ac]).replace(',','').replace(' ','') 177 if num == '': num = 0 178 elif num[-1] == '%': num = float(num[:-1] * 0.01) 179 else: num = float(num) 180 else: num = 1 181 nums.append(num) 182 for i in range(len(words)): 183 if words[i] in result: result[words[i]] = [pair[0] + pair[1] for pair in zip(result[words[i]],nums)] 184 array = [] 185 for word in result.keys(): 186 array.append([word] + result[word]) 187 return sorted(array,key=sortkey,reverse=True) 188 # Calculate a total sum for every column in addcols and for every word pair in wcol 189 # words do not need to be beside each other or in any particular order, so "buy a dog house", "good house for dog owners", "dog in my house" all go under "dog house" 190 def twograms(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False): 191 if wlist == 0: wlist = wordlist(li,wcol) 192 result = {} 193 if allindices: 194 for i in range(len(wlist)): 195 for j in range(len(wlist)): 196 if i != j: result[compose([wlist[i],wlist[j]])] = [0] * len(addcols) 197 for i in range(len(li)): 198 if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Two grams: " + str(i) + " / " + str(len(li)) 199 words = [x.strip() for x in li[i][wcol].split(' ')] 200 nums = [] 201 for ac in addcols: 202 if ac >= 0: 203 num = str(li[i][ac]).replace(',','').replace(' ','') 204 if num == '': num = 0 205 elif num[-1] == '%': num = float(num[:-1]) * 0.01 206 else: num = float(num) 207 else: num = 1 208 nums.append(num) 209 for i in range(len(words)): 210 if words[i] in wlist: 211 for j in range(i+1,len(words)): 212 if words[j] in wlist: 213 comb = compose([words[i],words[j]]) 214 if comb in result: result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)] 215 elif allindices == False: result[comb] = nums 216 array = [] 217 for words in result.keys(): 218 array.append([words] + result[words]) 219 return sorted(array,key=sortkey,reverse=True) 220 # Calculate a total sum for every column in addcols and for every word triplet in wcol (do not need to be beside each other or in any particular order) 221 # setting allindices to True slows down the calculation a lot but gives you a CSV with all possible combinations of words, making it convenient for 222 # working with the same word list on different data 223 def threegrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False): 224 if wlist == 0: wlist = wordlist(li,wcol) 225 result = {} 226 if allindices: 227 for i in range(len(wlist)): 228 for j in range(len(wlist)): 229 for k in range(len(wlist)): 230 if i != j and i != k and j != k: result[compose([wlist[i],wlist[j],wlist[k]])] = [0] * len(addcols) 231 for i in range(len(li)): 232 if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Three grams: " + str(i) + " / " + str(len(li)) 233 words = [x.strip() for x in li[i][wcol].split(' ')] 234 nums = [] 235 for ac in addcols: 236 if ac >= 0: 237 num = str(li[i][ac]).replace(',','').replace(' ','') 238 if num == '': num = 0 239 elif num[-1] == '%': num = float(num[:-1]) * 0.01 240 else: num = float(num) 241 else: num = 1 242 nums.append(num) 243 for i in range(len(words)): 244 if words[i] in wlist: 245 for j in range(i+1,len(words)): 246 if words[j] in wlist: 247 for k in range(j+1,len(words)): 248 if words[k] in wlist: 249 comb = compose([words[i],words[j],words[k]]) 250 if comb in result: 251 result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)] 252 elif allindices == False: result[comb] = nums 253 array = [] 254 for words in result.keys(): 255 array.append([words] + result[words]) 256 return sorted(array,key=sortkey,reverse=True) 257 # Calculate a total sum for every column in addcols and for every word quadruplet in wcol 258 def fourgrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1): 259 if wlist == 0: wlist = wordlist(li,wcol) 260 result = {} 261 for i in range(len(li)): 262 if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Four grams: " + str(i) + " / " + str(len(li)) 263 words = [x.strip() for x in li[i][wcol].split(' ')] 264 nums = [] 265 for ac in addcols: 266 if ac >= 0: 267 num = str(li[i][ac]).replace(',','').replace(' ','') 268 if num == '': num = 0 269 elif num[-1] == '%': num = float(num[:-1]) * 0.01 270 else: num = float(num) 271 else: num = 1 272 nums.append(num) 273 for i in range(len(words)): 274 if words[i] in wlist: 275 for j in range(i+1,len(words)): 276 if words[j] in wlist: 277 for k in range(j+1,len(words)): 278 if words[j] in wlist: 279 for l in range(k+1,len(words)): 280 if words[l] in wlist: 281 comb = compose([words[i],words[j],words[k],words[l]]) 282 if comb in result: 283 result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)] 284 else: result[comb] = nums 285 286 array = [] 287 for words in result.keys(): 288 array.append([words] + result[words]) 289 return sorted(array,key=sortkey,reverse=True) 290 # Filters array, returning only the rows where column wcol of that row contains the query keywords (keywords can appear in any order) 291 # This and the other filters are useful for taking a list of entries and creating a list of only valid entries according to some validity characteristic 292 # eg: 293 # dog house, 15 294 # cat, 18 295 # dog, 33 296 # filter(li,0,'dog'): 297 # dog house, 15 298 # dog, 33 299 def filter(li,wcol,query): 300 result = [] 301 for i in range(len(li)): 302 words = [x.strip() for x in li[i][wcol].split(' ')] 303 inlist = True 304 queryarray = query.split(' ') 305 if queryarray == ['']: queryarray = [] 306 for w in queryarray: 307 if w not in words: inlist = False 308 if queryarray == ['*']: inlist = len(li[i][wcol]) > 0 309 if inlist: result.append(li[i]) 310 return result 311 # Filters array, requiring column wcol to exactly match query 312 def phrasefilter(li,wcol,query): 313 result = [] 314 for i in range(len(li)): 315 if li[i][wcol] == query: result.append(li[i]) 316 return result 317 # Filters array, requiring function func taken of the row to return True (or 1) 318 def funcfilter(li,func): 319 result = [] 320 for i in range(len(li)): 321 if func(li[i]): result.append(li[i]) 322 return result 323 # Adds up columns in addcols for a query matching keyfilter(li,wcol,query); can also be thought of as doing a single n-keyword match 324 # eg: 325 # dog, 25 326 # cat, 15 327 # dog, 75 328 # dog, 10 329 # horse, 55 330 # cat, 7 331 # search(li,0,[1],'dog') gives ['dog',110] 332 def search(li,wcol,addcols,query): 333 result = [0] * len(addcols) 334 for i in range(len(li)): 335 words = [x.strip() for x in li[i][wcol].split(' ')] 336 nums = [] 337 for ac in addcols: 338 if ac >= 0: 339 num = str(li[i][ac]).replace(',','').replace(' ','') 340 if num == '': num = 0 341 elif num[-1] == '%': num = float(num[:-1] * 0.01) 342 else: num = float(num) 343 else: num = 1 344 nums.append(num) 345 inlist = True 346 queryarray = query.split(' ') 347 if queryarray == ['']: queryarray = [] 348 for w in queryarray: 349 if w not in words: inlist = False 350 if queryarray == ['*']: inlist = len(li[i][wcol]) > 0 351 if inlist: 352 result = [pair[0] + pair[1] for pair in zip(result,nums)] 353 return [query] + result 354 # Print a CSV from an array to stdout 355 def tochars(array,sp=','): 356 string = "" 357 for line in array: string += sp.join([str(x) for x in line]) + '\n' 358 return string[:-1] 359 # Save an array to CSV 360 def save(f,array,sp=','): 361 writeto = open(f,'w') 362 writeto.write(tochars(array,sp)) 363 writeto.close() 364 # Compares keywords by two different parameters from two different lists. For example, li1 can be a list of how much money is spent (on addcol1) on a particular combination of keywords (on keycol1) and li2 can be a list of upgraded accounts with the search query they came from on keycol2, and addcol 2 can be left blank to default to -1 (each row is worth one point). Fourth column is statistical significance. 365 # Remember that you may have to filter the list yourself first 366 # Arguments: 367 # grams = 1 for single keywords, 2 for pairs, 3 for triplets and 4 for quadruplets 368 # li1, li2 = your two lists 369 # keycol1, keycol2 = where the keywords are located in those two lists 370 # addcol1, addcol2 = the columns of what you want to add up, eg. cost (set to -1 or leave blank to make it add 1 for each row) 371 # sortkey = function to sort results by (highest first) 372 # usestem = stem keywords 373 # sigtable = add ratio and significance to table 374 # invertratio = set ratio column to col1/col2 instead of col2/col1 375 # preformatted = li1 and li2 are already properly formatted 376 # justpreformat = convert li1 and li2 into twocolumns for comparison but don't go all the way 377 # wordlimit = limit search to some more common keywords for speedup purposes 378 # Example: list of customers, some upgraded, with originating keywords, and a list of how much you're paying for each search phrase 379 # 380 # customers.csv: 381 # Name, Keyword, Status 382 # Bob Jones, spreadsheet csv software, upgraded 383 # Matt Bones, csv python utils, free 384 # Army Drones, free spreadsheet, free 385 # Glenn Mitt, csv software, upgraded 386 # Pat Submitt, python utils software, upgraded 387 # Shawn Wit, python spreadsheet program, upgraded 388 # 389 # costs.csv: 390 # csv software, useless, and, irrelevant, data, 5.00, blah, blah 391 # python spreadsheet, useless, and, irrelevant, data, 2.50, blah, blah 392 # spreadsheet utils, useless, and, irrelevant, data, 10.00, blah, blah 393 # csv utils, useless, and, irrelevant, data, 1.50, blah, blah 394 # 395 # Steps: 396 # 1. import spread (if not imported already) 397 # 2. upgrades = spread.filter(spread.load('customers.csv'),2,'upgraded') 398 # 3. costs = spread.load('costs.csv') 399 # 4. res = compare(1,costs,upgrades,0,1,5,invertratio=True) 400 # 5. spread.save('saved.csv',res) 401 # 402 # Res should look like: 403 # 404 # Keyword, Column 1, Column 2, Ratio, Significance 405 # spreadsheet, 12.50, 2, 6.25, -0.389 406 # utils, 11.50, 1, 11.50, -0.913 407 # csv, 7.50, 2, 3.75, 0.335 408 # python, 2.50, 2, 1.25, 2.031 409 # 410 # Or, if desired, you can: 411 # i1,i2 = compare(1,costs,upgrades,0,1,5,justpreformat=True) 412 # res1 = compare(1,i1,i2,0,1,5,invertratio=True,preformatted=True) 413 # res2 = compare(2,i1,i2,0,1,5,invertratio=True,preformatted=True) 414 # res3 = compare(3,i1,i2,0,1,5,invertratio=True,preformatted=True) 415 # res4 = compare(4,i1,i2,0,1,5,invertratio=True,preformatted=True) 416 # 417 # Note that significance is calculated based on col2/col1 regardless of invertratio, since getting 0 upgrades when you should have gotten 2 is not that unlikely, but calculating significance based on col1/col2 would give you infinity as infinity is infinitely far away from 0.5. 418 def compare(grams,li1,li2,keycol1,keycol2,addcol1=-1,addcol2=-1,sortkey=lambda x:x[1],usestem=True,sigtable=True,invertratio=False,preformatted=False,justpreformat=False,wordlimit=0): 419 gramfuncs = [0,onegrams,twograms,threegrams,fourgrams] 420 if preformatted == False: 421 s1 = declutter(cols(li1,[keycol1,addcol1]),[1]) 422 print "Done decluttering/stemming: 1/4" 423 s2 = declutter(cols(li2,[keycol2,addcol2]),[1]) 424 print "Done decluttering/stemming: 2/4" 425 s1 = stem(s1,[0]) if usestem else declutter(s1,[0]) 426 print "Done decluttering/stemming: 3/4" 427 s2 = stem(s2,[0]) if usestem else declutter(s2,[0]) 428 print "Done decluttering/stemming: 4/4" 429 else: s1,s2 = li1,li2 430 print "Printing sample of list 1" 431 print s1[:10] 432 print "Printing sample of list 2" 433 print s2[:10] 434 if justpreformat: return s1,s2 435 while type(s1[0][1]) is str: s1.pop(0) 436 while type(s2[0][1]) is str: s2.pop(0) 437 print "Cleaned invalid rows" 438 wl = sorted_wordlist(s1,0,[1]) 439 if wl.count('') > 0: blank = wl.pop(wl.index('')) 440 print "Base wordlist length: " + str(len(wl)) + " ; Top ten: " + str(wl[:10]) 441 if wordlimit > 0 and wordlimit < len(wl): 442 print "Shortening to " + str(wordlimit) 443 wl = wl[:wordlimit] 444 res1 = gramfuncs[grams](s1,0,[1],wl) 445 print "Done search: 1/2" 446 res2 = gramfuncs[grams](s2,0,[1],wl) 447 print "Done search: 2/2" 448 comb = sorted(splice(res1,res2,0,0),key=sortkey,reverse=True) 449 if sigtable: 450 tot1 = search(s1,0,[1],'') 451 tot2 = search(s2,0,[1],'') 452 ev = tot2[1]*1.0/tot1[1] 453 print "Totals: " + str(tot1[1]) + ", " + str(tot2[1]) 454 for i in range(len(comb)): 455 comb[i].append(comb[i][2 - invertratio]*1.0/(comb[i][1 + invertratio] + 0.000001)) 456 comb[i].append((comb[i][2] - ev * comb[i][1])*1.0/(ev * comb[i][1] + 0.000001) ** 0.5) 457 comb = [['Keyword','Column 1','Column 2','Ratio','Significance']] + comb 458 else: comb = [['Keyword','Column 1','Column 2']] + comb 459 print "Done" 460 return comb