# mixed_content_parser.rb
require 'java'

# Sanitizes "mixed content" markup (text interleaved with inline elements such
# as <emph>, <lb>, <title> and <unitdate>) and rewrites it into HTML-friendly
# elements using the jsoup Java library. Requires JRuby with jsoup on the
# classpath.
module MixedContentParser
  # Elements that may carry a `render` attribute; each is rewritten to a
  # <span> (or <code>) whose CSS class reflects the render value.
  RENDER_TAGS = %w[emph title unitdate].freeze

  # Cleans +content+ against a whitelist and returns the serialized result.
  #
  # Neither +content+ nor +opts+ is modified; all normalisation happens on
  # copies, so frozen strings and shared option hashes are safe to pass in.
  #
  # @param content [String, nil] raw mixed-content markup
  # @param base_uri [String] base URI handed to jsoup's clean and parse calls
  # @param opts [Hash] optional settings
  # @option opts [Boolean] :pretty_print (false) passed to jsoup's
  #   outputSettings.prettyPrint for both the cleaning and the final output
  # @option opts [Boolean] :wrap_blocks when truthy, each chunk of text
  #   separated by a blank line is wrapped in <p>...</p> before cleaning
  # @return [String, nil] nil when +content+ is nil, '' when it is blank,
  #   otherwise the result of the document's toString()
  def self.parse(content, base_uri, opts = {})
    return if content.nil?

    # Read the option without writing a default back into the caller's hash.
    pretty_print = opts[:pretty_print] || false

    # Non-destructive strip: the caller's string is left untouched. (strip
    # already removes trailing newlines, so no separate chomp is needed.)
    text = content.strip
    return '' if text.empty?

    # Create an empty document just to get an outputSettings object
    # (the original author notes the API fails when this is done directly).
    settings_holder = org.jsoup.Jsoup.parse("")
    settings_holder.outputSettings.prettyPrint(pretty_print)

    # Treat newline+tab as a paragraph break, same as a blank line.
    # NOTE(review): the original comment attributes this convention to Archon
    # exports -- confirm against real data.
    text = text.gsub("\n\t", "\n\n")

    # Transform blocks of text separated by blank lines into <p> wrapped blocks.
    text = text.split("\n\n").map { |block| "<p>#{block}</p>" }.join if opts[:wrap_blocks]

    # jsoup's relaxed whitelist plus the extra elements and their `render` attribute.
    whitelist = org.jsoup.safety.Whitelist.relaxed
                   .addTags("emph", "lb", "title", "unitdate")
                   .addAttributes("emph", "render")
                   .addAttributes("title", "render")
                   .addAttributes("unitdate", "render")

    cleaned_content = org.jsoup.Jsoup.clean(text, base_uri, whitelist, settings_holder.outputSettings())

    document = org.jsoup.Jsoup.parse(cleaned_content, base_uri, org.jsoup.parser.Parser.xmlParser())
    document.outputSettings.escapeMode(Java::OrgJsoupNodes::Entities::EscapeMode.xhtml)
    document.outputSettings.prettyPrint(pretty_print)

    # Replace <lb> with <br>.
    document.select("lb").tagName("br")

    # Rewrite <emph>/<title>/<unitdate> into elements styled through CSS classes.
    RENDER_TAGS.each do |tag|
      document.select(tag).each do |element|
        # Read the attribute once, before the element is renamed or the
        # attribute is removed.
        render = element.attr("render")

        # Every matched element becomes a <span> by default.
        element.tagName("span")

        if render.empty?
          # No @render value: plain span flagged as "render-none".
          element.attr("class", "emph render-none")
        else
          # Set a class so CSS can style based on the render value.
          element.attr("class", "emph render-#{render}")
          # render="nonproport" is emitted as <code> instead of <span>.
          element.tagName("code") if render == "nonproport"
          element.removeAttr("render")
        end
      end
    end

    document.toString()
  end
end