#! /usr/bin/env python
"""
Poor man's Sage to TeX converter.

The script converts some basic html to TeX sequences

USAGE:
./sage2tex file.sws

OUTPUT:
output is in file sage_worksheet.pdf

Works only for a limited set of commands.

By default the script unpacks the file into the sage_worksheet directory,
makes some substitutions and compiles the result with pdfLaTeX. Afterwards
the sage_worksheet directory is removed.

WARNING:
If you have a directory named sage_worksheet or files named
sage_worksheet.*, they will be silently overwritten! You may want to
uncomment the existence checks further down in this script.

AUTHORS: Robert Marik,
         Wilfried Huss
"""

# LaTeX preamble placed at the very start of the generated document.
# Besides loading packages it defines the helper macros that the converted
# cells emit: \sagegraphics (an image below sage_worksheet/cells/),
# \startoutput (a blue output paragraph) and \span / \declarespan (look up a
# macro registered under an HTML style string, e.g. the courier font rule).
preamble = r"""
\documentclass{article}
\usepackage{xcolor,amsmath,graphics,hyperref}
\usepackage[margin=1in]{geometry}
\usepackage[utf8]{inputenc}
\usepackage{amssymb}
\usepackage{fancyvrb}
\def\optmath#1{{}$#1${}}
\def\startcell[#1]{\xdef\cellnumber{#1}}
\long\def\startoutput#1{\par {\color{blue}#1}\par\bigskip}
\def\endcell{\par \color{black}}
\def\sagegraphics#1#2{\includegraphics{sage_worksheet/cells/#1/#2}}
\def\span[#1]{\expandafter\ifx\csname #1\endcsname\relax \def\temp{\message{define #1}} \else \expandafter\let\expandafter\temp\csname #1\endcsname\fi\temp}
\def\declarespan#1#2{\expandafter\def\csname #1\endcsname{#2}}
\declarespan{font-family: courier new,courier;}{\texttt}
\parindent=0pt
\parsep=10pt
\everymath{\def\_{\sb}\def\^#1#2{\sp{#2}}\displaystyle}

\usepackage{attachfile}
"""

# Literal HTML fragments / entities and the TeX text that replaces them.
# TextCell.latex() applies these with plain string replacement before its
# regular-expression rules run.
#
# Every backslash is written as "\\" so that no value depends on Python
# passing an unknown escape sequence (such as "\h" or "\i") through unchanged.
tag_replacements = {
    # paragraph and line structure
    "<p>": "\n\n",
    "</p>": "\n\n",
    "<hr />": "\n\n\\hrule\n\n",
    "<hr>": "\n\n\\hrule\n\n",
    "<br />": "\n\n",
    "<br>": "\n\n",
    # accented characters
    "&eacute;": "\\'{e}",
    "&iacute;": "\\'{\\i}",
    "&aacute;": "\\'{a}",
    "&uacute;": "\\'{u}",
    "&yacute;": "\\'{y}",
    "&scaron;": "\\v{s}",
    "&auml;": '\\"{a}',
    "&ouml;": '\\"{o}',
    "&uuml;": '\\"{u}',
    "&Auml;": '\\"{A}',
    "&Ouml;": '\\"{O}',
    "&Uuml;": '\\"{U}',
    "&szlig;": '\\ss{}',
    # other entities
    "&nbsp;": "~",
    "&lt;": " < ",
    # Counterpart of "&lt;": without it the raw "&" of "&gt;" reaches TeX,
    # where "&" is the alignment character.
    "&gt;": " > ",
    # lists
    "<ul>": "\\begin{itemize}",
    "<ol>": "\\begin{enumerate}",
    "</ul>": "\\end{itemize}",
    "</ol>": "\\end{enumerate}",
    "<li>": "\\item ",
    "</li>": "",
    # Disabled entries, kept from the original table:
    # "_": "\_",
    # "^": "\^{ }",
}


import re
import string
import os
import sys

from pygments import highlight
from pygments.lexers import PythonLexer, TexLexer, HtmlLexer
from pygments.formatters import LatexFormatter
from pygments.styles import STYLE_MAP
from pygments.styles import get_style_by_name
from sphinx.highlighting import SphinxStyle

class Colorize:
    """
    Callable wrapper around Pygments that turns source code into LaTeX.

    One lexer per supported family (Python-like, TeX, HTML) and a single
    LatexFormatter are created up front and reused for every call.
    """

    def __init__(self, style=SphinxStyle):
        self.pythonlexer = PythonLexer(encoding='utf8')
        self.texlexer = TexLexer(encoding='utf8')
        self.htmllexer = HtmlLexer(encoding='utf8')
        self.formatter = LatexFormatter(noclasses=True, style=style)

    def get_style_defs(self):
        """Return the style definitions produced by the LaTeX formatter."""
        return self.formatter.get_style_defs()

    def __call__(self, s, format = 'python'):
        """
        Highlight ``s`` as ``format`` and return the formatter output.

        'python', 'sage' and 'cython' share the Python lexer; 'latex' and
        'html' have their own.  For any other ``format`` the input string is
        returned unchanged.
        """
        if format in ('python', 'sage', 'cython'):
            lexer = self.pythonlexer
        elif format == 'latex':
            lexer = self.texlexer
        elif format == 'html':
            lexer = self.htmllexer
        else:
            # No lexer for this language: hand the text back untouched.
            return s
        return highlight(s, lexer, self.formatter)

# Shared highlighter instance (default style); InputCell.latex() and
# write_latex_file() use it.
colorize = Colorize()

def line_breaks(line, columns=89):
    r"""
    Break over-long lines of ``line`` into pieces of at most ``columns``
    characters.

    ``line`` may contain several newline-separated lines; each one is
    wrapped on its own, so existing line breaks are kept and never counted
    towards the width of the following line.  Every inserted break is
    written as a backslash followed by a newline, so the continuation stays
    visible in the verbatim output.  A line that already fits (including one
    of exactly ``columns`` characters) is returned untouched.

    Raises ValueError if ``columns`` is smaller than 1, since no progress
    could be made in that case.

    TODO: use a more clever algorithm
    """
    if columns < 1:
        raise ValueError("columns must be at least 1")

    wrapped = []
    # split('\n') rather than splitlines(): a trailing newline yields a final
    # empty item, so joining with '\n' reproduces the input layout exactly.
    for physical in line.split('\n'):
        pieces = [physical[start:start + columns]
                  for start in range(0, len(physical), columns)]
        wrapped.append('\\\n'.join(pieces))
    return '\n'.join(wrapped)

#print colorize("print 1+1")

#if os.path.exists("sage_worksheet"):
   #print "The directory sage_worksheet exists. Exiting."
   #sys.exit(0)

#if os.path.exists("sage_worksheet.tex"):
   #print "The file sage_worksheet.tex exists. Exiting."
   #sys.exit(0)

#if os.path.exists("sage_worksheet.pdf"):
   #print "The file sage_worksheet.pdf exists. Exiting."
   #sys.exit(0)

class Cell:
    """
    Generic worksheet cell: a piece of worksheet text together with the
    cell id it was parsed with.

    The base class emits its text unchanged; subclasses override
    ``cell_type`` and ``latex``.
    """
    # Class-level defaults; __init__ sets text and cell_id per instance.
    cell_id = -1
    cell_type = "generic"
    text = ""

    def __init__(self, text, cell_id):
        self.cell_id = cell_id
        self.text = text

    def latex(self):
        """Return the LaTeX source for this cell (here: the raw text)."""
        return self.text

    def __str__(self):
        """Return a debugging dump: type/id header, then the raw text between two rules."""
        rule = '=====================================================\n'
        header = 'Type: %s (id = %d)\n' % (self.cell_type, self.cell_id)
        return ''.join([header, rule, self.text, rule])

class TextCell(Cell):
    """
    Cell holding the HTML text found between the computation cells.

    ``latex`` converts the small subset of HTML listed in
    ``tag_replacements`` and ``_rules`` into TeX; anything else is passed
    through unchanged.
    """
    cell_type = "text"

    # Ordered (compiled pattern, replacement template) pairs.
    #
    # * The math rules come first: they need the literal "</span>", which the
    #   span rules further down rewrite to "}".
    # * The "#anchor" link rule must precede the generic href rule.
    # * Backslashes in the templates are doubled, because the re module
    #   rejects unknown escapes such as "\l" or "\s" on Python 3.7+.
    _rules = [
        (re.compile(r"<html><span class=\"math\">\\newcommand{\\Bold}\[1\]{\\mathbf{#1}}(.*?)</span></html>"),
         r"$ \1 $"),
        (re.compile(r"<html><span class=\"math\">(.*?)</span></html>"),
         r"$ \1 $"),
        (re.compile(r"<html><div class=\"math\">(.*?)</div></html>"),
         r"$$ \1 $$"),
        (re.compile(r"<a href=\"#(.*?)\">(.*?)</a>"), r"\2 \\ref{\1}"),
        # [^"]* instead of a greedy .* so that two anchors on one line are
        # not merged into a single label.
        (re.compile(r"<a name=\"([^\"]*)\"/>"), r"\\label{\1}"),
        (re.compile(r"<a name=\"([^\"]*)\"></a>"), r"\\label{\1}"),
        (re.compile(r"<font color='black'>(.*?)</font>"), r"\1"),
        (re.compile(r"<span style=\"color: #(.*?);\">"),
         r"\\textcolor[HTML]{\1}{"),
        (re.compile(r"<span style=\"(.*?)\">"), r"\\span[\1]{"),
        (re.compile(r"</span>"), "}"),
        (re.compile(r"<strong>(.*?)</strong>"), r"\\textbf{\1}"),
        (re.compile(r"<em>(.*?)</em>"), r"\\textit{\1}"),
        (re.compile(r"<h1>(.*?)</h1>"), r"\\section{\1}"),
        (re.compile(r"<h2>(.*?)</h2>"), r"\\subsection{\1}"),
        (re.compile(r"<h3>(.*?)</h3>"), r"\\subsubsection{\1}"),
        (re.compile(r"<h4>(.*?)</h4>"), r"\\paragraph{\1}"),
        (re.compile(r"<a href=\"(.*?)\">(.*?)</a>"), r"\\href{\1}{\2}"),
        # NOTE(review): the \sagegraphics macro in the preamble takes two
        # arguments (cell id, file name) but only one is emitted here, as in
        # the original rule -- confirm what a text cell should pass.
        (re.compile(r"<img src='cell://(.*?)'>"), r"\\sagegraphics{\1}"),
    ]

    def latex(self):
        """
        Return the cell text with the supported HTML converted to TeX.

        The literal replacements from ``tag_replacements`` are applied once;
        the regular-expression rules are then applied repeatedly until the
        text stops changing, which also resolves tags nested inside a tag of
        the same kind.  Every rule that matches removes at least one HTML
        tag, so the loop terminates.
        """
        latex_str = self.text
        for key, value in tag_replacements.items():
            latex_str = latex_str.replace(key, value)

        while True:
            converted = latex_str
            for pattern, replacement in self._rules:
                converted = pattern.sub(replacement, converted)
            if converted == latex_str:
                return latex_str
            latex_str = converted

class InputCell(Cell):
    """
    Cell holding the source code of a computation.

    The language is taken from a ``%name`` marker on the first line
    ('sage' when there is none) and decides how the code is highlighted.
    """
    cell_type = 'input'
    language = 'sage'

    # Languages that can be selected by a "%name" marker on the first line.
    _marker_languages = ('latex', 'html', 'python', 'cython', 'maxima')

    def __init__(self, text, cell_id):
        Cell.__init__(self, text, cell_id)
        self._detect_language()

    def _detect_language(self):
        """Set ``self.language`` from the first line of the cell text."""
        lines = self.text.splitlines()
        # Empty text has no first line; treat it like a cell without marker.
        first_line = lines[0].rstrip() if lines else ''
        if first_line.startswith('%') and first_line[1:] in self._marker_languages:
            self.language = first_line[1:]
        else:
            self.language = 'sage'

    def latex(self):
        """
        Return the code as a framed, labelled Verbatim block.

        Highlighted output gets the frame and label added to the Verbatim
        header produced by the formatter.  When ``colorize`` hands the text
        back unchanged (it has no lexer for e.g. 'maxima') the code is
        wrapped in a plain Verbatim block instead of being emitted as raw
        TeX.
        """
        label = self.language.capitalize() + " code"
        latex_str = colorize(self.text, format=self.language)

        if latex_str == self.text:
            code = self.text
            # \end{Verbatim} has to start on a line of its own.
            if not code.endswith("\n"):
                code += "\n"
            return ("\\begin{Verbatim}[frame=single,label={" + label + "}]\n"
                    + code + "\\end{Verbatim}\n")

        verb_before = r"\begin{Verbatim}[commandchars=@\[\]]"
        verb_after = (r"\begin{Verbatim}[frame=single,label={" + label
                      + r"},commandchars=@\[\]]")
        return latex_str.replace(verb_before, verb_after)

class OutputCell(Cell):
    """
    Cell holding the output of a computation.

    Output that starts with Sage's math markup is typeset as a formula;
    everything else is shown as blue verbatim text.  Images referenced with
    a ``cell://`` URL become ``\\sagegraphics`` calls for this cell's id.
    """
    cell_type = "output"
    re_math = re.compile(r"<html><(?P<tag>span|div) class=\"math\">(?:\\newcommand{\\Bold}\[1\]{\\mathbf{#1}})?(.*?)</(?P=tag)></html>")
    re_img = re.compile(r"<img src='cell://(.*?)'>")
    re_font = re.compile(r"<html><font color='black'>(.*?)</font></html>")

    def latex(self):
        """Return the LaTeX source for this output cell."""
        # Backslashes are doubled in the replacement templates: the re module
        # rejects unknown escapes such as "\s" or "\d" on Python 3.7+.
        graphics = "\\\\sagegraphics{%d}{\\1}" % self.cell_id

        if self.re_math.match(self.text) is not None:
            # Group 2 is the formula; group 1 is the span/div tag name.
            latex_str = self.re_math.sub(
                r"\\startoutput{$\\displaystyle \2 $}", self.text)
            latex_str = self.re_img.sub(graphics, latex_str)
            return self.re_font.sub(r"\1", latex_str)

        verb_begin = "\\begin{Verbatim}[formatcom=\\color{blue}]"
        verb_end = "\\end{Verbatim}"

        body = line_breaks(self.text)
        # \end{Verbatim} has to start on a line of its own.
        if body and not body.endswith("\n"):
            body += "\n"
        latex_str = verb_begin + "\n" + body + verb_end + "\n"

        latex_str = self.re_img.sub(graphics, latex_str)
        latex_str = self.re_font.sub(r"\1", latex_str)

        # Graphics cannot live inside Verbatim: close the environment before
        # each image line and reopen it afterwards.
        lines = []
        for line in latex_str.split('\n'):
            if r"\sagegraphics" in line:
                lines.extend([verb_end, line, verb_begin])
            else:
                lines.append(line)
        latex_str = '\n'.join(lines)

        # Drop the empty Verbatim environments left over by the step above.
        return latex_str.replace(verb_begin + "\n" + verb_end, "")

def print_cells(cell_list):
    """
    Print every cell of ``cell_list`` on standard output (debugging aid).

    The call form ``print(cell)`` with a single argument behaves the same
    under Python 2 and Python 3.
    """
    for cell in cell_list:
        print(cell)

def parse_worksheet(worksheet_text):
    """
    Split the lines of a worksheet into a list of cells.

    ``worksheet_text`` is an iterable of lines.  Three kinds of delimiter
    lines are recognised:

    * a line starting with ``{{{id=<digits>`` ends the text collected so far
      (stored as a TextCell) and sets the id for the following cells,
    * a ``///`` line ends the input part (stored as an InputCell),
    * a ``}}}`` line ends the output part (stored as an OutputCell).

    Delimiter lines are accepted with ``\\n``, ``\\r\\n`` or no line ending.
    Chunks consisting only of whitespace are dropped.  Text that follows the
    last delimiter is stored as a final TextCell instead of being lost.

    Returns the list of cells in worksheet order.
    """
    # The id must contain at least one digit.  (The former pattern ended in
    # an unescaped "|\Z", i.e. an alternative matching the empty string, for
    # which no id group exists.)
    cell_start = re.compile(r'\{\{\{id=(\d+)')

    cell_list = []
    pending = []            # lines of the chunk currently being collected
    current_cell_id = -1

    for line in worksheet_text:
        marker = line.rstrip('\r\n')
        started = cell_start.match(line)

        if started is not None:
            # The text before a computation cell carries that cell's id.
            current_cell_id = int(started.group(1))
            cell_class = TextCell
        elif marker == '///':
            cell_class = InputCell
        elif marker == '}}}':
            cell_class = OutputCell
        else:
            pending.append(line)
            continue

        chunk = "".join(pending)
        pending = []
        if chunk.strip():
            cell_list.append(cell_class(chunk, current_cell_id))

    trailing = "".join(pending)
    if trailing.strip():
        cell_list.append(TextCell(trailing, current_cell_id))

    return cell_list

def write_latex_file(cell_list):
    """
    Assemble and return the complete LaTeX source for ``cell_list``.

    The document consists of the preamble, the highlighter's style
    definitions, a header that attaches the worksheet named by
    ``sys.argv[1]`` and prints a table of contents, the LaTeX of every cell
    in order, and the closing ``\\end{document}``.
    """
    document_head = """
    \\begin{document}
    \\attachfile[description=You can get the Sage worksheet by clicking this icon.]{%s}
    \\tableofcontents
    """

    pieces = [
        preamble,
        colorize.get_style_defs(),
        "\n",
        document_head % str(sys.argv[1]),
    ]
    pieces.extend(cell.latex() for cell in cell_list)
    pieces.append("\\end{document}")
    return "".join(pieces)

# ---------------------------------------------------------------------------
# Script body: unpack the worksheet archive, convert worksheet.html to
# sage_worksheet.tex and compile it with pdfLaTeX.
# ---------------------------------------------------------------------------
import shutil
import subprocess

if len(sys.argv) < 2:
    sys.stderr.write("USAGE: ./sage2tex file.sws\n")
    sys.exit(1)

# The archive name is passed as a separate argument instead of being pasted
# into a shell command line, so names with spaces or shell metacharacters
# are handled safely.
if subprocess.call(["tar", "xjf", sys.argv[1]]) != 0:
    sys.stderr.write("Could not unpack %s\n" % sys.argv[1])
    sys.exit(1)

with open("sage_worksheet/worksheet.html", 'r') as worksheet:
    cell_list = parse_worksheet(worksheet.readlines())

# print_cells(cell_list)   # uncomment to dump the parsed cells
latex_src = write_latex_file(cell_list)

with open("sage_worksheet.tex", 'w') as TeX_file:
    TeX_file.write(latex_src)

# pdfLaTeX runs twice.  As before, the unpacked directory is removed only
# when both runs succeed.
pdflatex_command = ["pdflatex", "sage_worksheet.tex"]
if subprocess.call(pdflatex_command) == 0 and subprocess.call(pdflatex_command) == 0:
    shutil.rmtree("sage_worksheet")