"""kitaab.py — static wiki-to-HTML site generator.

Reads vimwiki-style ``.wiki`` files, renders them to HTML with mistune,
and builds pages, tag indexes, a "recents" home page, and an RSS feed
through Jinja2 templates. Run with ``-p``/``--prod`` to build with
absolute (production) URLs instead of relative ones.
"""
from . import mistune
from poonam import TagHelper
from basant.shared import config
from jinja2 import Environment, FileSystemLoader
import argparse
import glob
import os
import random
import platform
import re
import shutil
from datetime import datetime, timezone, timedelta, tzinfo

# Default encoding
encoding = 'utf-8'

# Activating the renderer
renderer = mistune.Renderer()
markdown = mistune.Markdown(renderer=renderer)


# Checking for dev command
parser = argparse.ArgumentParser()
parser.add_argument(
    # BUG FIX: help text was a copy-paste ("Prints the supplied argument.")
    "-p", "--prod", help="Build with absolute (production) URLs.",
    action="store_true")
args = parser.parse_args()

# Declaring build url that will be used in several parts of the app
build_url = config.relative_build_url
if args.prod:
    build_url = config.absolute_build_url


def generate_navigation(template_nav=config.template_nav):
    """Render the site navigation bar from the configured nav tags."""
    env = Environment(loader=FileSystemLoader(''))
    template = env.get_template(template_nav)
    index_tags = config.nav_tags
    # Strip extensions so nav links point at the generated *_index pages
    return template.render(index_urls=[f.split(".")[0] for f in index_tags])


def generate_head(template_head=config.template_head, page_title=config.home_name):
    """Render the HTML <head> fragment for a page with the given title."""
    env = Environment(loader=FileSystemLoader(''))
    template = env.get_template(template_head)
    return template.render(build_url=build_url, page_title=page_title,
                           name_of_site=config.name_of_site)


def add_file_path(pages, root_dir):
    """Prefix each page name with root_dir, returning a set of paths."""
    return {root_dir + '/' + page for page in pages}


def generate_index_pages(template=config.template_main,
                         template_nav=config.template_nav,
                         template_head=config.template_head):
    """Create one <tag>_index.html listing page per tag, then the tag table.

    Args:
        template: name of the main page template.
        template_nav: name of the navigation template.
        template_head: name of the <head> template.
    """
    all_tags = TagHelper.getAllTags(path=config.files_folder)
    env = Environment(loader=FileSystemLoader(''))
    # Regular index pages use the main template.
    # BUG FIX: resolve the template once, outside the loop; the original
    # rebound `template` to a Template object inside the loop, so the second
    # iteration passed a Template back into env.get_template().
    index_template = env.get_template(template)
    for index in all_tags.keys():
        pages = add_file_path(all_tags[index], config.files_folder)
        entries = create_entries(pages)
        subPages = generate_sub_pages(
            entries, len(entries), config.files_folder, False)
        page_title = index.capitalize() + ' index'
        current_index_template = index_template.render(
            head=generate_head(template_head, page_title),
            page_body=subPages,
            page_date=None,
            page_tags=None,
            page_navigation=generate_navigation(template_nav),
            page_title=page_title,
            config=config)
        slugfile = config.build_folder + index + '_index.html'
        with open(slugfile, 'w', encoding=encoding) as fobj:
            fobj.write(current_index_template)
    # The tag table only needs building once, after all tag pages are done
    generate_table_index()


def generate_table_index(template=config.template_index,
                         template_nav=config.template_nav):
    """Build tag_index.html: a table of every tag with its page count."""
    env = Environment(loader=FileSystemLoader(''))
    tpl = env.get_template(template)
    all_tags = TagHelper.getAllTags(path=config.files_folder)
    parsed_set = {tag: len(pages) for tag, pages in all_tags.items()}
    current_index_template = tpl.render(
        index_urls=parsed_set,
        page_date=None,
        page_tags=None,
        page_navigation=generate_navigation(template_nav),
        page_title='Piles',
        head=generate_head(),
        config=config)
    slugfile = config.build_folder + 'tag_index.html'
    with open(slugfile, 'w', encoding=encoding) as fobj:
        fobj.write(current_index_template)


# Generates html files in the site folder, using the entries and the template
def generate_html_pages(site_folder, entries, template, sub_pages_list, template_nav):
    """Write one HTML file per entry into site_folder.

    Args:
        site_folder: build output root (trailing slash expected).
        entries: list of entry dicts produced by create_entries().
        template: name of the main page template.
        sub_pages_list: unused here; kept for interface compatibility.
        template_nav: name of the navigation template.
    """
    env = Environment(loader=FileSystemLoader(''))
    # BUG FIX: resolve the template once; the original overwrote `template`
    # with a Template object inside the loop and crashed on iteration two.
    page_tpl = env.get_template(template)
    for entry in entries:
        # Creating navigation
        nav_html = generate_navigation(template_nav)
        # BUG FIX: split whenever the tag string is non-empty; the original
        # `len(entry['tags']) > 1` measured characters, so a one-letter tag
        # was never split into a list.
        if entry['tags']:
            entry['tags'] = entry['tags'].split(' ')
        page_template = page_tpl.render(
            head=generate_head(page_title=entry['title']),
            page_body=entry['pageContent'],
            page_date="<date>%s</date>" % entry['date'],
            page_update="<date>%s</date>" % entry['update'],
            page_tags=entry['tags'],
            name_of_site="Kitaab",
            page_navigation=nav_html,
            page_title=entry['title'],
            build_url=entry['parent_text'],
            config=config)

        # Create the per-section output folder when building flat
        if config.flat_build:
            folder = site_folder + entry['folder']
            if not os.path.exists(folder):
                os.mkdir(folder)

        # Write the HTML file
        slug_file = site_folder + entry['slug']
        with open(slug_file, 'w', encoding=encoding) as fobj:
            fobj.write(page_template)

    print("All pages created!")


# Get title by parsing and cleaning the first line of the markdown file
def get_entry_title(page):
    """Return the title of a wiki page, or None when it cannot be read.

    The title is the first line (minus any '# ' or '%title' markers); when
    no %title marker is present, falls back to the file name without its
    '.wiki' suffix.
    """
    try:
        with open(page, 'r', errors='ignore') as fobj:
            first_line = fobj.read().splitlines()[0]
        first_line = first_line.replace('# ', '')
        if '%title' in first_line:
            return first_line.replace('%title', '')
        # No explicit title: use the file name, minus the ".wiki" suffix.
        # NOTE(review): assumes paths are exactly one folder deep — confirm.
        return page.split('/')[1][:-5]
    except FileNotFoundError:
        # Page listed in a tag index but absent on disk
        return None
    except UnicodeDecodeError:
        print(f"Page could not be read: {page}")
        return None
    except IndexError:
        # Empty file: no first line to read.
        # BUG FIX: typo "was'nt"; also return None so the caller's
        # `title is None` check skips the page instead of receiving a list.
        print(f"{page} probably wasn't grabbed, cannot read file")
        return None


# Get the slug from the markdown file name
def get_entry_slug(page):
    """Return the file name of `page` without folders or the .wiki suffix."""
    slug = page.split("/")[-1]
    slug = re.sub(r'\.wiki$', '', slug)
    return slug if slug else ''


def style_iframes(page):
    """Wrap every <iframe> element in the page in a styled <figure>."""
    regex = r"<iframe.+?[\"'].+?><\/iframe>"
    for match in re.finditer(regex, page, re.MULTILINE):
        found = match.group()
        iframe = "<figure class='youtube'>%s</figure>" % found
        page = page.replace(found, iframe)
    return page


# Get a list of the tags in the page
def get_tags(page):
    """Return the page's vimwiki tags (":tag1:tag2:") as one space-joined string."""
    regex = r":[a-zA-Z:]*:"
    collected = []
    for match in re.finditer(regex, page, re.MULTILINE):
        found = match.group().strip(":")
        collected.extend(found.split(":"))
    return " ".join(collected)


# Checks for local images links in markdown and add the build_url and medias_folder url
def fix_images_urls(page):
    """Prefix relative markdown image links with build_url + media folder."""
    regex = r"\!\[.*\]\((.*)\)"
    media_folder = 'media/'  # TODO bad hard code
    for match in re.finditer(regex, page, re.MULTILINE):
        captured_group = match.group(1)
        # BUG FIX: guard against an empty link target — replace('', url)
        # would insert the URL between every character of the page.
        if captured_group and captured_group[:4] != "http":
            full_url = build_url + media_folder + captured_group
            page = page.replace(captured_group, full_url)
    return page


def fix_amp(page):
    """Escape bare ampersands for HTML output; returns '' for non-strings."""
    try:
        # BUG FIX: the replacement had degraded to a no-op (" & " -> " & ");
        # restore the HTML entity the function name promises.
        return page.replace(" & ", " &amp; ")
    except AttributeError:
        print(f"{page} is not a string")
        return ""


def fix_wiki_links(page, path):
    """Convert [[target|label]] / [[target]] vimwiki links to HTML anchors.

    Args:
        page: page text to rewrite.
        path: section folder used for nested (non-flat) builds.
    """
    regex = r"\[\[([\s\S]+?\|?[\s\S]+?)\]\]"
    for match in re.finditer(regex, page, re.MULTILINE):
        link_elem = match.group(1).split("|")
        target = link_elem[0].strip()
        if len(link_elem) > 1:
            # Explicit label after the pipe
            full_url = ("<a href='" + build_url + target + ".html' >"
                        + link_elem[1].strip() + "</a>")
        elif config.flat_build:
            full_url = ("<a href='" + build_url + target + ".html' >"
                        + target + "</a>")
        else:
            full_url = ("<a href='" + build_url + path + "/" + target
                        + ".html' >" + target + "</a>")
        page = page.replace(match[0], full_url)
    return page


def remove_tags(page):
    """Strip vimwiki ":tag:" markers from the page."""
    try:
        return re.sub(r":[a-zA-Z:]*:", '', page)
    except Exception:
        print(f'Unexpected error in removing tags in {page}')
        return page


def remove_title(page):
    """Drop the first line of the page when it is a %title line."""
    try:
        lines = page.splitlines()
        if '%title' in lines[0]:
            lines = lines[1:]
        return '\n'.join(lines)
    except Exception:
        print(f'Unexpected error in removing title in {page}')
        return page


def remove_date(page):
    """Strip the %date marker line from the page."""
    try:
        regex = r"%date [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}"
        return re.sub(regex, '', page)
    except Exception:
        print(f'Unexpected error in removing date in {page}')
        return page


def remove_update(page):
    """Strip the %update marker line from the page."""
    try:
        regex = r"%update [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}"
        return re.sub(regex, '', page)
    except Exception:
        # BUG FIX: message said "date" (copy-paste)
        print(f'Unexpected error in removing update in {page}')
        return page


def remove_links(page):
    """Strip %link markers from the page (the targets themselves remain)."""
    try:
        return re.sub(r"%link *", '', page)
    except Exception:
        # BUG FIX: message said "date" (copy-paste)
        print(f'Unexpected error in removing links in {page}')
        return page


def replacement_function(replace):
    """re.sub callback: expand an @<generator-name> placeholder.

    Only 'name-generation' is supported: it picks a random first/second
    name pair from the last two comma-separated lines of a hard-coded
    wiki file.
    """
    name = replace.group().strip('@<>')
    if name == 'name-generation':
        # TODO gross hardcoded; this file should probably be an asset??
        with open(config.files_folder + '210124-0033.wiki') as f:
            lines = f.readlines()
        firsts = lines[-2].split(',')
        seconds = lines[-1].split(',')
        first = random.choice(firsts).strip('\n')
        second = random.choice(seconds).strip('\n')
        return first + ' ' + second
    # BUG FIX: unknown generators used to return None, which makes re.sub
    # raise TypeError; leave the placeholder untouched instead.
    return replace.group()


def replace_generated(page):
    """Expand all @<generator> placeholders in the page."""
    return re.sub(r'@<[A-Za-z- ]*>', replacement_function, page)


def get_date(page):
    """Return the %date value ('YYYY-MM-DD HH:MM') or None when absent."""
    # BUG FIX: the original used str.strip('%date'), which strips a
    # character *set* and left a leading space; capture the value directly.
    match = re.search(
        r"%date ([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2})", page)
    return match.group(1) if match else None


def get_update(page):
    """Return the %update value ('YYYY-MM-DD HH:MM') or None when absent."""
    match = re.search(
        r"%update ([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2})", page)
    return match.group(1) if match else None


def get_links(page):
    """Return the list of %link targets in the page, or None when absent.

    BUG FIX: the original called .group() on plain str fragments, always
    tripping its bare except and returning None.
    NOTE(review): assumes the link target is the whitespace-delimited token
    following each marker — confirm against the wiki files.
    """
    matches = re.findall(r"%link +(\S+)", page)
    return matches if matches else None


def parse_headings(page):
    """Convert vimwiki '= Heading =' lines to markdown '### Heading'."""
    try:
        return re.sub(r'= [\[\]a-zA-Z0-9:|+& ]* =',
                      lambda x: x.group().replace('=', '###', 1).strip('='),
                      page)
    except Exception as e:
        print(f"Could not parse headers {e}")
        return page


# From the list of files, creates the main array of entries that will be processed later
def create_entries(pages):
    """Build the list of entry dicts (slug, dates, tags, rendered HTML).

    Pages whose title cannot be read are skipped.
    """
    fullContent = []
    for page in pages:
        # Process the page with dedicated functions
        path = clean_path(page)
        title = get_entry_title(page)
        if title is None:
            continue

        try:
            with open(page, 'r') as fobj:
                markdown_text = fobj.read()
        except UnicodeDecodeError:
            print(f"Couldn't parse {title}, {page}")
            continue

        markdown_text = parse_headings(markdown_text)
        markdown_text = replace_generated(markdown_text)
        markdown_text = style_iframes(markdown_text)
        markdown_text = fix_images_urls(markdown_text)
        markdown_text = fix_wiki_links(markdown_text, path["folder"])
        tags = get_tags(markdown_text)
        date = get_date(markdown_text)
        update = get_update(markdown_text)
        links = get_links(markdown_text)
        markdown_text = remove_tags(markdown_text)
        markdown_text = remove_title(markdown_text)
        markdown_text = remove_date(markdown_text)
        markdown_text = remove_update(markdown_text)
        markdown_text = remove_links(markdown_text)
        pageContent = markdown(markdown_text)

        # Create the page object with all the information we need.
        # NOTE: 'iso_date' deliberately carries clean_path's raw
        # "yymmdd-hhmm" stamp ('date') — sorting and the RSS feed parse it
        # with the "%y%m%d-%H%M" format.
        tempPage = {
            'slug': path["slug"],
            'file': path['file'],
            'folder': path["folder"],
            'parent_url': path['parent_url'],
            'parent_text': path['parent_text'],
            'iso_date': path['date'],
            'title': fix_amp(title),
            'pageContent': pageContent,
            'tags': tags,
            'links': links,
        }
        # Prefer an explicit %date/%update; otherwise derive from the path
        # (dropping the seconds from the ISO timestamp).
        tempPage['date'] = date if date else path['iso_date'][:-3]
        tempPage['update'] = update if update else tempPage['date']

        fullContent.append(tempPage)

    return fullContent


# Copy assets to production folder
def move_files(site_folder, path):
    """Copy every regular file under `path` into the build tree."""
    assets = os.listdir(path)
    # poonam outputs under basant/, but the build tree is rooted above it
    if 'basant/' in path:
        to_copy_to = path.split("basant/")[1]
    else:
        to_copy_to = path
    if assets:
        for asset in assets:
            asset = os.path.join(path, asset)
            if os.path.isfile(asset):
                shutil.copy(asset, site_folder + to_copy_to)
    else:
        print("No assets found!")


# Transforms the file locations to an array of strings
def clean_path(path):
    """Split a wiki file path into slug/date/folder metadata.

    Returns a dict with keys: slug, date, iso_date, folder, file,
    parent_url, parent_text.
    """
    path_clean = re.sub(r'\.wiki$', '', path)
    if platform.system() == 'Windows':
        items = path_clean.split('\\')
    else:
        items = path_clean.split('/')
    # TODO ugly hack to handle poonam outputting public, but we want ./
    print(items)
    if items[0] == 'public':
        items[0] = './'

    path_items = {
        "slug": None,
        "date": None,
        "folder": items[0],
        "file": items[-1],
    }

    # File names are either "yymmdd-hhmm" (a bare date), "yymmdd-hhmm-string"
    # or "string"; dated names become <string>.html / <date>.html slugs.
    match = re.match(r"[0-9]{6}-[0-9]{4}", path_items["file"])

    if match:
        path_items["date"] = match[0]
        if match[0] != path_items["file"]:
            # Date plus slug text: strip the date prefix from the slug
            path_items["slug"] = path_items["file"].replace(
                match[0] + "-", "") + ".html"
        else:
            path_items["slug"] = path_items["file"] + ".html"
        # Convert the stamp to an ISO-style date to allow page sorting
        if config.date_format == "EU":
            path_items["iso_date"] = str(
                datetime.strptime(path_items["date"], '%y%m%d-%H%M'))
        if config.date_format == "ISO":
            path_items["iso_date"] = str(
                datetime.strptime(path_items["date"], '%Y-%m-%d'))
    else:
        path_items["slug"] = path_items["file"] + ".html"
        # No date in the name: fall back to the file's mtime on disk.
        # NOTE(review): hard-coded home directory — confirm for deployment.
        try:
            last_edit = os.path.getmtime('/home/anish/' + path)
            last_edit_iso = datetime.fromtimestamp(last_edit)
        except (ValueError, FileNotFoundError):
            last_edit_iso = datetime.fromtimestamp(0)

        if config.date_format == "EU":
            path_items["date"] = str(last_edit_iso.strftime("%d %b, %y"))
        else:
            path_items["date"] = str(last_edit_iso)
        path_items["iso_date"] = str(last_edit_iso)

    if config.flat_build:
        path_items["slug"] = path_items["folder"] + "/" + path_items["slug"]

    # TODO weird special case
    if "index" == path_items["file"]:
        path_items["parent_url"] = ""
        path_items["parent_text"] = config.home_name
        path_items["slug"] = "created_index" + ".html"
    else:
        if config.flat_build:
            path_items["parent_url"] = path_items["folder"] + ".html"
        else:
            path_items["parent_url"] = path_items["folder"]
        path_items["parent_text"] = path_items["folder"].replace(
            "-", " ").capitalize()

    return path_items


# Generate the list of sub pages for each section
def generate_sub_pages(entries, num, folder, title, title_name=""):
    """Render a date-sorted listing of the newest `num` entries.

    Sorts `entries` in place (newest first) as a side effect.
    """
    entries.sort(key=lambda x: x["iso_date"], reverse=True)
    # Take n entries (e.g. 50 for the home page, all for section pages)
    selected_entries = entries[:num]
    env = Environment(loader=FileSystemLoader(''))
    template = env.get_template(config.template_list)
    return template.render(
        entries=selected_entries,
        title_name=title_name,
        title=title)


# Creates the home page using home.md
def create_home_page(template, site_folder):
    """Fill the home-page template's placeholder strings (legacy helper)."""
    # "content_list" is a future replacement point for the sub page listing
    html = "content_list"
    template = template.replace('page_title', config.home_name)
    template = template.replace('page_body', html)
    template = template.replace('build_url', build_url)
    template = template.replace('page_navigation', "")
    template = template.replace('footer', "")
    return template


def create_rss_feed(rss_entries, rss_template, rss_item_template, site_folder):
    """Write feed.xml into site_folder from the 25 newest entries."""
    with open(rss_template, 'r') as fobj:
        template = fobj.read()
    with open(rss_item_template, 'r') as fobj:
        itemTemplate = fobj.read()
    rss_entries.sort(key=lambda x: x["iso_date"], reverse=True)

    rss_items = ""
    for rss_entry in rss_entries[:25]:
        entry_template = itemTemplate
        entry_template = entry_template.replace(
            'rssItemTitle', rss_entry["title"])
        entry_template = entry_template.replace(
            'rssItemUrl', build_url + rss_entry["slug"])
        # ROBUSTNESS FIX: undated entries carry an mtime-based iso_date that
        # "%y%m%d-%H%M" cannot parse; fall back to the raw value.
        try:
            date = datetime.strptime(rss_entry["iso_date"], "%y%m%d-%H%M")
            date = date.strftime("%a, %d %b %Y %H:%M")
        except ValueError:
            date = rss_entry["iso_date"]
        entry_template = entry_template.replace('rssItemDate', date)
        entry_template = entry_template.replace(
            'rssItemContent', rss_entry["pageContent"])
        rss_items += entry_template

    template = template.replace('name_of_site', config.name_of_site)
    template = template.replace('site_meta_description',
                                config.site_meta_description)
    template = template.replace('build_url', build_url)
    template = template.replace('date_build', str(datetime.now().date()))
    template = template.replace('rss_content', rss_items)

    slug_file = site_folder + "feed.xml"
    with open(slug_file, 'w', encoding=encoding) as fobj:
        fobj.write(template)


def generate_website():
    """Build the whole site into config.build_folder from scratch."""
    # If build folder exists delete it
    if os.path.exists(config.build_folder):
        shutil.rmtree(config.build_folder)

    # Make new folders
    os.makedirs(config.build_folder + 'assets/')
    os.makedirs(config.build_folder + 'media/')

    # Get main html template
    env = Environment(loader=FileSystemLoader(''))
    home_page = env.get_template(config.template_main)

    generate_index_pages(config.template_main, config.template_nav)

    pages = glob.glob(config.files_folder + '**/*.wiki', recursive=True)
    entries = create_entries(pages)
    sub_pages_list = generate_sub_pages(
        entries, len(entries), config.files_folder, False)

    # Short listing (50 newest entries) for the home page
    home_pageSubList = generate_sub_pages(
        entries, 50, config.files_folder, False)

    generate_html_pages(config.build_folder, entries, config.template_main,
                        sub_pages_list, config.template_nav)

    # Every entry also goes into the RSS feed
    rss_entries = list(entries)

    # Move the assets
    move_files(config.build_folder, config.assets_folder)
    move_files(config.build_folder, config.medias_folder)

    # Once all sections have been processed, finish the home page
    home_page = home_page.render(
        head=generate_head(page_title="Recent Files"),
        page_body=home_pageSubList,
        page_date=None,
        page_tags=None,
        page_navigation=generate_navigation(),
        page_title="Recent Files",
        config=config)

    slug_file = config.build_folder + "recents.html"
    with open(slug_file, 'w', encoding=encoding) as fobj:
        fobj.write(home_page)

    # Create RSS File
    create_rss_feed(rss_entries, config.rss_template,
                    config.rss_item_template, config.build_folder)


# Triggers the website build
if __name__ == '__main__':
    generate_website()