aboutsummaryrefslogtreecommitdiffstats
path: root/Bonus/html2latex
diff options
context:
space:
mode:
author Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:05:14 +0000
committer Tatsuya Kinoshita <tats@vega.ocn.ne.jp> 2011-05-04 07:05:14 +0000
commit 72f72d64a422d6628c4796f5c0bf2e508f134214 (patch)
tree 0c9ea90cc53310832c977265521fb44db24a515e /Bonus/html2latex
parent Adding upstream version 0.3 (diff)
download w3m-upstream/0.5.1.tar.gz
w3m-upstream/0.5.1.zip
Adding upstream version 0.5.1 upstream/0.5.1
Diffstat (limited to '')
-rwxr-xr-x Bonus/html2latex 517
1 files changed, 517 insertions, 0 deletions
diff --git a/Bonus/html2latex b/Bonus/html2latex
new file mode 100755
index 0000000..898799a
--- /dev/null
+++ b/Bonus/html2latex
@@ -0,0 +1,517 @@
+#!/usr/local/bin/ruby
+
+#
+# HTML to LaTeX converter
+# by A. Ito, 16 June, 1997
+#
+
+require 'kconv'
+
+# configuration
+def gif2eps(giffile,epsfile)
+ cmd = "convert #{giffile} #{epsfile}"
+ STDERR.print cmd,"\n"
+ system cmd
+end
+
+###########################################################################
+class Tag # One HTML tag: a lower-cased tag name plus a hash of its attributes.
+ def initialize(str) # str: the tag text, with or without the surrounding "<" and ">"
+ if str =~ /<(.+)>/ then # keep only what is between the angle brackets when they are present
+ str = $1
+ end
+ tags = str.split # whitespace-separated words; NOTE(review): a quoted value containing spaces (alt="a b") is split into several words here
+ @tagname = tags.shift.downcase # first word is the tag name ("/p" for a closing tag); NOTE(review): raises when there is no word at all (shift returns nil)
+ @vals = {} # attribute name (lower case) => value
+ tags.each do |t|
+ if t =~ /=/ then
+ tn,tv = t.split(/\s*=\s*/,2) # name=value form: split at the first "="
+ tv.sub!(/^"/,"") # drop a leading double quote from the value
+ tv.sub!(/"$/,"") # drop a trailing double quote from the value
+ @vals[tn.downcase] = tv
+ else
+ @vals[t.downcase] = TRUE # bare attribute (no "="): recorded with the value TRUE
+ end
+ end
+ end
+ def tagname # Returns the lower-cased tag name.
+ return @tagname
+ end
+ def each # Yields every attribute as (name, value).
+ @vals.each do |k,v|
+ yield k,v
+ end
+ end
+ def switch(k) # Returns the value stored for attribute k (keys are lower case), or nil when the attribute is absent.
+ return @vals[k]
+ end
+end
+
+class TokenStream # Tokenizer over an HTML file: get returns Tag objects, replaced entities, or runs of plain text.
+ TAG_START = ?< # character that opens a tag; compared with @buf[@bpos] in get
+ TAG_END = ?> # character that closes a tag (end symbol handed to read_until)
+ AMP_START = ?& # character that opens an entity such as &amp;
+ AMP_END = ?; # character that closes an entity
+
+ AMP_REPLACE_TABLE = { # entity text (including "&" and ";") => replacement string returned by replace_amp
+ '&amp;' => '\\&',
+ '&gt;' => '$>$',
+ '&lt;' => '$<$',
+ '&nbsp;' => '~',
+ '&quot;' => '"',
+ }
+ def initialize(file) # file: an already-open File, or an argument for File.new (e.g. a path)
+ if file.kind_of?(File) then
+ @f = file
+ else
+ @f = File.new(file)
+ end
+ @buf = nil # current input line; nil makes get fetch the next line
+ @bpos = 0 # read position inside @buf
+ end
+
+ def read_until(endsym) # Collects text from @bpos up to and including endsym, reading further lines as needed; returns it as one String.
+ complete = FALSE
+ tag = []
+ begin
+ while @bpos < @buf.size
+ c = @buf[@bpos] # presumably an integer character code in the Ruby this targets (see the 10/13 test and c.chr below) - TODO confirm
+ if c == endsym then
+ tag.push(c.chr)
+ complete = TRUE
+ @bpos += 1 # leave the position just past the end symbol
+ break
+ end
+ if c == 10 || c == 13 then # LF or CR inside the token is replaced by a space
+ tag.push(' ')
+ else
+ tag.push(c.chr)
+ end
+ @bpos += 1
+ end
+ unless complete # line exhausted before endsym was seen: continue with the next line
+ @buf = @f.gets # NOTE(review): unlike get, this line is not passed through Kconv.toeuc - confirm
+ @bpos = 0
+ break if @f.eof? # NOTE(review): stops as soon as the file reports EOF, so the text gathered so far is returned without endsym
+ end
+ end until complete
+ return tag.join('')
+ end
+
+ def get # Returns the next token: a Tag, an entity replacement String, or a run of text; nil when the file reports EOF after a line fetch.
+ while TRUE
+ if @buf.nil? then
+ @buf = Kconv.toeuc(@f.gets) # fetch the next line and convert it with Kconv.toeuc
+ if @f.eof? then # NOTE(review): eof? can already be true right after the last line is read, so that line appears to be dropped - confirm
+ return nil
+ end
+ @bpos = 0
+ end
+ if @buf[@bpos] == TAG_START then
+ return Tag.new(read_until(TAG_END)) # the whole "<...>" text becomes a Tag
+ elsif @buf[@bpos] == AMP_START then
+ return replace_amp(read_until(AMP_END)) # the "&...;" text is mapped through AMP_REPLACE_TABLE
+ else
+ i = @bpos
+ while i < @buf.size && @buf[i] != TAG_START && @buf[i] != AMP_START # scan to the next tag/entity start or the end of the line
+ i += 1
+ end
+ r = @buf[@bpos,i-@bpos] # the plain-text run
+ if i == @buf.size then
+ @buf = nil # line consumed; a new one is fetched on the next pass
+ else
+ @bpos = i
+ end
+ redo if r =~ /^\s+$/ # whitespace-only text is skipped: restart the loop body for the next token
+ return r
+ end
+ end
+ end
+ public :eof? # NOTE(review): appears before the def below; presumably relies on an eof? already visible in the targeted interpreter - TODO confirm
+ def eof? # True when the underlying file reports EOF.
+ @f.eof?
+ end
+ def replace_amp(s) # Returns the AMP_REPLACE_TABLE entry for s, or s itself when the entity is not in the table.
+ if AMP_REPLACE_TABLE.key?(s) then
+ return AMP_REPLACE_TABLE[s]
+ else
+ return s
+ end
+ end
+end
+
+
+def print_header # Prints the fixed LaTeX preamble: jarticle with the epsf option, the \hr and pre/endpre macros, and \gt defined as \dg when undefined.
+ print '
+\documentstyle[epsf]{jarticle}
+\def\hr{\par\hbox to \textwidth{\hrulefill}}
+\def\pre{\begin{quote}\def\baselinestretch{0.8}\tt\obeylines}
+\def\endpre{\end{quote}}
+\makeatletter
+\@ifundefined{gt}{\let\gt=\dg}{}
+\makeatother
+'
+end
+
+
+class Environ_stack # Stack of environment objects; tag lookup searches from the top of the stack downwards.
+ def initialize(*envs) # envs: initial stack contents, bottom first
+ @stack = envs
+ end
+ def action(tag) # tag: a tag-name String. Returns the first non-nil result of env.action(tag), or nil when no environment knows the tag.
+ if tag =~ /^!/ then # comment
+ return ["",nil]
+ end
+ i = @stack.size-1 # start at the top of the stack
+ while i >= 0
+ a = @stack[i].action(tag)
+ unless a.nil? then
+ return a
+ end
+ i -= 1
+ end
+ return nil
+ end
+ def pop # Removes and returns the top environment.
+ @stack.pop
+ end
+ def push(env) # Puts env on top of the stack.
+ @stack.push(env)
+ end
+ def top # Returns the top environment without removing it.
+ @stack[@stack.size-1]
+ end
+ def dup # Pushes a clone of the top environment; note this redefines dup for this class (it does not copy the stack).
+ @stack.push(top.clone)
+ end
+end
+
+
+class Environment # Output state for one nesting level: the active tag table, silent/alignment flags, and table-collection state.
+ def initialize(interp) # interp: the tag table consulted by action
+ @silent = FALSE # when TRUE, flush discards output outside tables
+ @in_table = FALSE # when TRUE, flush collects text into the current table cell
+ @interp = interp;
+ @align = nil; # set by paragraph; non-nil makes flush end lines with a LaTeX line break
+ end
+ def action(tag) # Returns the table entry for tag, as given by @interp[tag].
+ return @interp[tag]
+ end
+
+ def flush(tok) # Emits tok: String tokens are escaped for LaTeX first, then stored in the current table cell or printed.
+ if tok.kind_of?(String) then
+ tok = tok.gsub(/&/,"\\&"); # NOTE(review): in current Ruby "\\&" in a replacement means "the whole match", leaving "&" unchanged - confirm for the target interpreter
+ tok = tok.gsub(/%/,"\\%");
+ tok = tok.gsub(/#/,"\\#");
+ tok = tok.gsub(/\$/,"\\$");
+ tok = tok.gsub(/_/,"\\verb+_+");
+ tok = tok.gsub(/\^/,"\\verb+^+");
+ tok = tok.gsub(/~/,"\\verb+~+");
+ end
+ if @in_table then
+ @table[@table_rows][@table_cols] += tok # NOTE(review): the cell is not initialised by start_row/start_col, so the first append operates on nil - confirm
+ elsif !@silent then
+ if !@align.nil? && tok =~ /\n$/ then
+ print tok.chop,"\\\\\n" # aligned paragraph: replace the final newline with a LaTeX line break plus newline
+ else
+ print tok
+ end
+ end
+ end
+
+ def set_interp(interp) # Replaces the tag table used by action.
+ @interp = interp
+ end
+
+ # tag processing methods: each takes the Tag being processed and is invoked by name from the main loop
+
+ # <TITLE>: start discarding output (flush prints nothing while @silent is TRUE)
+ def do_silent(tag)
+ @silent = TRUE
+ end
+
+ # </TITLE>: resume normal output
+ def undo_silent(tag)
+ @silent = FALSE
+ end
+
+ # <IMG>: convert the image named by the src attribute to EPS and emit an \epsfile command for it
+ def img_proc(tag)
+ src = tag.switch('src') # NOTE(review): nil when the tag has no src attribute, which makes the next line raise
+ newfile = src.sub(/\.GIF/i,".eps") # first ".gif" (any case) becomes ".eps"; the pattern is not anchored to the end of the name
+ gif2eps(src,newfile)
+ flush "\\epsfile{file=#{newfile}}\n"
+ end
+
+ # <TABLE>: reset the collected-table state and switch flush into cell-collection mode
+ def starttable(tag)
+ @table = [] # @table[row][col]: cell text
+ @tablespan = [] # @tablespan[row][col]: colspan recorded at the first column of each cell
+ @table_rows = -1 # index of the current row; -1 until the first <TR>
+ @table_cols_max = 0 # largest column index seen in any row
+ @in_table = TRUE
+ unless tag.switch('border').nil? then # a border attribute, with or without a value, turns on ruled output
+ @table_border = TRUE
+ else
+ @table_border = FALSE
+ end
+ end
+
+ # <TR>: begin a new, empty row
+ def start_row(tag)
+ @table_rows += 1
+ @table[@table_rows] = []
+ @tablespan[@table_rows] = []
+ @table_cols = -1 # index of the current cell's last column; -1 until the first <TD>
+ @colspan = 1
+ end
+
+ # <TD>: begin a new cell, honouring the colspan attribute (default 1)
+ def start_col(tag)
+ @colspan = tag.switch('colspan')
+ if @colspan.nil? then
+ @colspan = 1
+ else
+ @colspan = @colspan.to_i
+ end
+ @tablespan[@table_rows][@table_cols+1] = @colspan # span is stored at the cell's first column ...
+ @table_cols += @colspan # ... while flush stores the text at the cell's last column
+ if @table_cols > @table_cols_max then
+ @table_cols_max = @table_cols
+ end
+ end
+
+ # </TABLE>: leave collection mode and print the collected cells as a LaTeX tabular environment
+ def endtable(tag)
+ @in_table = FALSE
+ flush "\\begin{tabular}{*{"
+ flush @table_cols_max+1 # column count; an Integer, so the String-only escaping in flush is skipped
+ if @table_border then
+ flush "}{|l}|}\n\\hline\n"
+ else
+ flush "}{l}}\n"
+ end
+ for i in 0..@table_rows
+ j = 0
+ while j <= @table_cols # NOTE(review): @table_cols is the last column index of the final row, not @table_cols_max or this row's own width - confirm
+ span = @tablespan[i][j] # NOTE(review): nil when row i has no cell starting at column j, which would break "j += span" below
+ if span == 1 then
+ flush @table[i][j]
+ elsif @table_border then
+ form = "|l"
+ if j+span > @table_cols then # cell reaches the last column: add the closing vertical rule
+ form = "|l|"
+ end
+ flush "\\multicolumn{"+span.to_s+"}{"+form+"}{"
+ flush @table[i][j+span-1] # text of a spanning cell is stored at its last column (see start_col)
+ flush "}"
+ else
+ flush "\\multicolumn{"+span.to_s+"}{l}{"
+ flush @table[i][j+span-1]
+ flush "}"
+ end
+ j += span
+ if j <= @table_cols then
+ flush "&" # column separator between cells
+ end
+ end
+ flush "\\\\\n"
+ flush "\\hline\n" if @table_border
+ end
+ flush "\\end{tabular}\n"
+ end
+
+ # <CENTER>: \hfil inside a table cell, otherwise open a center environment
+ def startcenter(tag)
+ if @in_table then
+ flush "\\hfil"
+ else
+ flush "\\begin{center}\n"
+ end
+ end
+
+ # </CENTER>: \hfil inside a table cell, otherwise close the center environment
+ def endcenter(tag)
+ if @in_table then
+ flush "\\hfil"
+ else
+ flush "\\end{center}\n"
+ end
+ end
+
+ # <P>: plain paragraph break, or open the environment matching the align attribute and remember its closing text
+ def paragraph(tag)
+ align = tag.switch('align')
+ if align.nil? then
+ flush "\\par\n"
+ @endparagraph = ""
+ else
+ align = align.downcase
+ case align # NOTE(review): any other align value emits nothing and leaves @endparagraph unchanged, yet @align is still set below
+ when "left" then
+ flush "\\begin{flushleft}\n"
+ @endparagraph = "\\end{flushleft}\n"
+ when "center" then
+ flush "\\begin{center}\n"
+ @endparagraph = "\\end{center}\n"
+ when "right" then
+ flush "\\begin{flushright}\n"
+ @endparagraph = "\\end{flushright}\n"
+ end
+ end
+ @align = align
+ end
+
+ # </P>: close the alignment environment opened by paragraph, if any
+ def endparagraph(tag)
+ unless @align.nil? then
+ @align = nil # cleared first so the closing text is printed without the line-break rewriting in flush
+ flush @endparagraph
+ end
+ end
+end
+
+
+enum_interp = { # tag table active inside <OL>; entries are [action, environment change] as read by the main loop
+ 'li' => ["\\item ",nil]
+}
+
+item_interp = { # tag table active inside <UL>
+ 'li' => ["\\item ",nil]
+}
+
+desc_interp = { # tag table active inside <DL>: <DT> opens the \item[ label, <DD> closes it
+ 'dt' => ["\\item[",nil],
+ 'dd' => ["]\n",nil]
+}
+
+table_interp = { # tag table active inside <TABLE>: symbols name Environment methods
+ 'tr' => [:start_row,nil],
+ 'td' => [:start_col,nil],
+ '/tr' => ["",nil],
+ '/td' => ["",nil],
+}
+
+para_interp = { # tag table pushed for an aligned <P>; </P> closes it and pops the environment
+ '/p' => [:endparagraph ,"pop",TRUE],
+}
+
+main_interp = { # base tag table. Entry layout: [0] String to emit or Symbol naming an Environment method; [1] nil, a table to push, or "pop"; [2] TRUE when the tag needs \begin{document} to have been emitted
+ 'body' => ["\\begin{document}\n",nil,FALSE],
+ '/body' => ["\\end{document}\n",nil,FALSE],
+ 'head' => ["",nil,FALSE],
+ '/head' => ["",nil,FALSE],
+ 'html' => ["",nil,FALSE],
+ '/html' => ["",nil,FALSE],
+ 'title' => [:do_silent,nil,FALSE],
+ '/title' => [:undo_silent,nil,FALSE],
+ '!' => ["",nil,FALSE],
+ 'h1' => ["\\section{",nil,TRUE],
+ 'h2' => ["\\subsection{",nil,TRUE],
+ 'h3' => ["\\subsubsection{",nil,TRUE],
+ 'h4' => ["\\paragraph{",nil,TRUE],
+ '/h1' => ["}\n",nil,TRUE],
+ '/h2' => ["}\n",nil,TRUE],
+ '/h3' => ["}\n",nil,TRUE],
+ '/h4' => ["}\n",nil,TRUE],
+ 'a' => ["",nil,TRUE],
+ '/a' => ["",nil,TRUE],
+ 'center' => [:startcenter,nil,TRUE],
+ '/center' => [:endcenter,nil,TRUE],
+ 'ol' => ["\\begin{enumerate}\n",enum_interp,TRUE],
+ '/ol' => ["\\end{enumerate}\n","pop",TRUE],
+ 'ul' => ["\\begin{itemize}\n",item_interp,TRUE],
+ '/ul' => ["\\end{itemize}\n","pop",TRUE],
+ 'dl' => ["\\begin{description}\n",desc_interp,TRUE],
+ '/dl' => ["\\end{description}\n","pop",TRUE],
+ 'pre' => ["\\begin{pre}\n",nil,TRUE],
+ '/pre' => ["\\end{pre}\n",nil,TRUE],
+ 'p' => [:paragraph ,para_interp,TRUE],
+ 'br' => ["\\par ",nil,TRUE],
+ 'img' => [:img_proc,nil,TRUE],
+ 'hr' => ["\\hr ",nil,TRUE],
+ 'b' => ["{\\bf\\gt ",nil,TRUE],
+ '/b' => ["}",nil,TRUE],
+ 'strong' => ["{\\bf\\gt ",nil,TRUE],
+ '/strong' => ["}",nil,TRUE],
+ 'dfn' => ["{\\bf\\gt ",nil,TRUE],
+ '/dfn' => ["}",nil,TRUE],
+ 'i' => ["{\\it",nil,TRUE],
+ '/i' => ["}",nil,TRUE],
+ 'address' => ["{\\it",nil,TRUE],
+ '/address'=> ["}",nil,TRUE],
+ 'cite' => ["{\\it",nil,TRUE],
+ '/cite' => ["}",nil,TRUE],
+ 'code' => ["{\\tt",nil,TRUE],
+ '/code' => ["}",nil,TRUE],
+ 'kbd' => ["{\\tt",nil,TRUE],
+ '/kbd' => ["}",nil,TRUE],
+ 'tt' => ["{\\tt",nil,TRUE],
+ '/tt' => ["}",nil,TRUE],
+ 'samp' => ["{\\tt",nil,TRUE],
+ '/samp' => ["}",nil,TRUE],
+ 'em' => ["{\\em",nil,TRUE],
+ '/em' => ["}",nil,TRUE],
+ 'u' => ["$\\underline{\\mbox{",nil,TRUE],
+ '/u' => ["}}$",nil,TRUE],
+ 'sub' => ["${}_\mbox{",nil,TRUE], # NOTE(review): "\m" has a single backslash, unlike the "\\" used elsewhere; in a double-quoted string this likely yields "mbox" without the backslash - confirm
+ '/sub' => ["}$",nil,TRUE],
+ 'sup' => ["${}^\mbox{",nil,TRUE], # NOTE(review): same single-backslash "\m" as in the sub entry - confirm
+ '/sup' => ["}$",nil,TRUE],
+ 'table' => [:starttable, table_interp,TRUE],
+ '/table' => [:endtable, "pop",TRUE],
+ 'font' => ["",nil,TRUE],
+ '/font' => ["",nil,TRUE],
+}
+
+
+
+
+################################ MAIN ####################################
+
+$in_document = FALSE # TRUE once \begin{document} has been emitted (or <body> seen) and not yet closed
+print_header
+intp = Environ_stack.new(Environment.new(main_interp)) # environment stack seeded with the base tag table
+f = TokenStream.new(ARGV[0]) # the HTML input file is the first command-line argument
+until f.eof? # NOTE(review): tests the file's EOF state, not whether buffered tokens remain, so tokens left in the last buffered line may go unprocessed - confirm
+ tok = f.get
+ if tok.kind_of?(Tag) then
+ case tok.tagname # track explicit <body>/</body> so \begin{document} is not emitted twice
+ when "body"
+ $in_document = TRUE
+ when "/body"
+ $in_document = FALSE
+ end
+ act = intp.action(tok.tagname) # table entry for this tag, searched from the innermost environment outwards
+ if act.nil? then
+ STDERR.print "tag ",tok.tagname," ignored\n"
+ else
+ if act[2] && !$in_document then # body-level tag met before any <body>: open the document first
+ print "\\begin{document}\n"
+ $in_document = TRUE
+ end
+ # environment push: a Hash in act[1] becomes the tag table of a cloned top environment; <p> pushes only when it has an align attribute
+ if act[1].kind_of?(Hash) &&
+ (tok.tagname != "p" || tok.switch('align') != nil) then
+ intp.dup
+ intp.top.set_interp(act[1])
+ end
+
+ if act[0].kind_of?(String) then
+ intp.top.flush act[0]
+ elsif act[0].kind_of?(Fixnum) then # interned symbol: assumes Symbols such as :paragraph are Fixnum instances; NOTE(review): where they are not, method actions are never run - confirm
+ intp.top.send(act[0],tok)
+ end
+
+ # environment pop: done after the action, so the closing action runs in the environment being closed
+ if act[1] == "pop" then
+ intp.pop
+ end
+ end
+ elsif !tok.nil? then # plain text or a replaced entity
+ intp.top.flush tok
+ end
+end
+if $in_document then # input ended without </body>: close the document
+ print "\\end{document}\n"
+end