# -*- coding: utf-8 -*-

# Gorazd Generator
# Generator of dictionary entries from ALTO XML.
# Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Generators work in the following way. First they read the input, which is either ALTO XML
(plus additional txt files in the case of SJS) or Word XML, and convert it to the parser input, where we basically
just encode XML elements using a simpler text format so that it is easily parsed by our parsers. This is the job of
a preprocessor. In the case of SJS this preprocessor also tries to correct as many OCR errors as possible. For fixing
the old Greek diacritics we use txt files produced by ABBYY FineReader alongside the ALTO XML produced by ABBYY RS4.

The parser input is then parsed by parsers which are generated by ANTLR from our grammar definition that is contained
in ../grammar. The resulting parse tree is finally converted into XML and serialized into a disk file whose path
is then handed over to the appropriate postprocessor, whose job is mainly to split the result into files
(one dictionary entry per file) and to prepare the MARC XML for upload into Invenio. The postprocessor also contains
various bulk editing operations (implemented via XPaths) which save quite a lot of human work.
"""

from __future__ import unicode_literals

import codecs
import gc
import glob
import logging
import os
import re
from abc import ABCMeta
from itertools import izip, chain, imap
from lxml import etree as et

from antlr4 import ErrorNode, TerminalNode, InputStream, CommonTokenStream
from antlr4.tree.Trees import Trees
from grammar.RSIRecordLexer import RSIRecordLexer
from grammar.RSIRecordParser import RSIRecordParser
from grammar.SJSRecord2Lexer import SJSRecord2Lexer
from grammar.SJSRecord2Parser import SJSRecord2Parser
from grammar.SNSPRecordLexer import SNSPRecordLexer
from grammar.SNSPRecordParser import SNSPRecordParser
from os.path import join

from postprocess import SJSPostprocessor, RSIPostprocessor, SNSPPostprocessor
from preprocess import SJSPreprocessor, RSIPreprocessor
from utils import create_path, nwise, savexml


class Generator(object):
    """
    This abstract class is responsible for turning the source into Gorazd XML.
    The source can be either ALTO XML or XML from Microsoft Word.
    Concrete generators are instances of classes that inherit from Generator
    and which override only the init method where they set appropriate pre/post-process objects.
    Therefore it is very easy to refactor the code so that concrete generators are instances of `Generator` with a lot of
    configuration options passed to `Generator.__init__`.

    TODO: describe the parsing process
    """
    __metaclass__ = ABCMeta

    def __init__(self, input_dir='', output_dir='', preprocess=True, **kwargs):
        """
        Set the defaults shared by all generators and prepare the output directories.

        Concrete generators call this first and then overwrite the lexer, parser,
        preprocessor, postprocessor class and the description of the text format.

        :param input_dir: directory that `run` searches for files matching `file_masks`
        :param output_dir: directory for the results; it and its `intermediate/`
            subdirectory are handed to `create_path`
        :param preprocess: when true, `run` sends the files through `self.preprocessor` first
        :param kwargs: accepted for call compatibility; not used here
        """
        self.name = 'Generator'  # used in logging
        self.version = '1'  # used in logging
        self.log = logging.getLogger(__name__)

        self.lexer = None  # autogenerated by ANTLR
        self.parser = None  # autogenerated by ANTLR
        self.preprocess = preprocess
        # Always define the attribute (it used to exist only when `preprocess` was true),
        # so that reading it never raises AttributeError; subclasses assign the real object.
        self.preprocessor = None
        self.postprocessor_cls = None
        # dict where we store information that is passed to postprocessor, see `extract_metadata` in `parse` method
        self.metadata = None
        self.namespaces = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'}
        self.file_masks = ["*.xml"]
        self.attributes = []  # attributes of the ALTO XML elements that we copy over to Gorazd XML

        self.input_dir = input_dir
        self.output_dir = output_dir
        create_path(self.output_dir)
        self.intermediate = join(self.output_dir, 'intermediate/')  # we store intermediate results for debugging
        create_path(self.intermediate)

        # the following four fields describe the structure of our intermediate text representation
        self.opening = ""
        self.FS = ''  # Field Separator
        self.closing = ""
        self.RS = ''  # Record Separator

        # these fields have special meaning in the parsing process
        self.special_tags = [self.RS]
        self.separators = []
        self.stopphrases = []

    def run(self):
        """
        Run the whole pipeline: collect the input files, optionally preprocess them,
        parse them and pass the parse result to a newly created postprocessor.
        """
        self.log.info("Generating records for files from %s" % self.input_dir)
        found = []
        for mask in self.file_masks:
            found.extend(glob.glob(join(self.input_dir, mask)))
        found.sort()
        self.log.info("Found the following files: %s" % found)

        to_parse = self.preprocessor.run(found) if self.preprocess else found
        parsed = self.parse(to_parse)

        # self.metadata is read only after `parse`, which is where it gets filled in
        postprocessor = self.postprocessor_cls(
            output_dir=self.output_dir,
            metadata=self.metadata,
            result_dir=join(self.output_dir, "results")
        )
        postprocessor.run(parsed)

    def xml2input(self, xml, p):
        '''
        Transforms xml file from path p to the parser input.
        The path is here just so that we can save the intermediate debugging file with correct filename.

        Every element with a special tag or the ALTO String tag found under `xml` becomes one line of text;
        the joined text is written to `<intermediate>/<filename minus last 4 chars>-grammar-input.txt`.

        NOTE(review): the body uses `unicode` and sorts the result of `map` in place, so it relies on Python 2.

        :param xml: element whose descendants are converted
        :param p: path of the source file, used only for logging and for naming the output file
        :return: tuple (path of the written file, the generated text)
        '''
        self.log.info("Transforming XML to input file for grammar parser")
        # We translate all elements whose tag is alto:TAG to a string TAG.
        STRING_TAG = et.QName(self.namespaces['alto'], "String").text
        # Each stop phrase becomes a list of its whitespace-separated words.
        STOPWORDS = map(lambda x: unicode(x).split(), self.stopphrases)
        # STOPWORDS.sort(key=lambda x: -len(x))
        # Reverse lexicographic order puts a phrase before any shorter phrase that is its prefix,
        # so the longer phrase is tried first in `transform`.
        STOPWORDS.sort(reverse=True)

        def isolate_separators(l):
            """
            Generator that yields elements from the iterator l and examines their CONTENT attribute.
            A single leading character and any run of trailing characters that are listed in
            `self.separators` are cut off and yielded as new String elements carrying only CONTENT;
            the original element (with shortened CONTENT) is yielded between them unless nothing is left.
            Elements without CONTENT pass through unchanged.

            NOTE(review): only single characters (c[0], c[-1]) are compared, so a multi-character entry
            in `self.separators` can never be isolated here.

            :param l: iterator
            :return: Element
            """
            for e in l:
                c = e.get("CONTENT")
                if c:
                    if c[0] in self.separators:
                        p = c[0]  # local to this generator; does not touch the path argument of xml2input
                        c = c[1:]
                        yield (et.Element("String", attrib={"CONTENT": p}))
                    # trailing separators are collected last-to-first ...
                    queue = []
                    while c and (c[-1] in self.separators):
                        queue.append(et.Element("String", attrib={"CONTENT": c[-1]}))
                        c = c[:-1]
                    if c:
                        e.set("CONTENT", c)
                        yield (e)
                    # ... and popped from the end, which restores their original order
                    while queue:
                        e = queue.pop()
                        yield (e)
                else:  # no CONTENT
                    yield (e)

        def element2input(e):
            """
            Turns element into its string representation for the parser input.
            Special tags are returned as the tag itself, a separator as its bare content and anything
            else as opening + attribute values (each followed by FS) + closing.
            :param e: Element
            :return: unicode
            """
            if e.tag in self.special_tags:
                return e.tag
            else:
                content = e.get('CONTENT', '')
                if content in self.separators:
                    return unicode(content)
                else:
                    res = self.opening
                    for a in self.attributes:  # we need to preserve order of fields
                        res += e.get(a, u'') + self.FS
                    # drops the last character, i.e. the trailing FS
                    # NOTE(review): assumes FS is exactly one character and `self.attributes` is not empty
                    return res[:-1] + self.closing

        def transform(l):
            """
            Transforms all elements from the iterator l into parser input.
            This is a generator: it yields one string per output line.
            :param l: iterator
            :return: unicode (yielded one at a time)
            """

            def match_sequences(ws, es):
                '''
                Check if the word sequence ws matches the contents of elements from sequence es.
                Only the first len(ws) elements of es are compared.
                :param ws: list
                :param es: tuple
                :return: boolean
                '''
                for i in range(len(ws)):
                    if ws[i] != es[i].get("CONTENT", None):
                        return False
                return True

            # Slide a window as long as the longest stop phrase over the elements.
            # NOTE(review): `max` raises ValueError when STOPWORDS is empty (the base-class default
            # for `stopphrases` is an empty list); the exact windowing/fill behaviour comes from
            # `utils.nwise`, which is not visible here.
            l = nwise(isolate_separators(l),
                      len(max(STOPWORDS, key=len)),
                      fillvalue=et.Element("FILL", attrib={"CONTENT": "FILL"})
                      )
            ignored = []
            for es in l:
                for ws in STOPWORDS:
                    if match_sequences(ws, es):
                        # we have matched word of length len(ws) and we must ignore elements where it came from
                        for i in range(len(ws)):
                            ignored.append(es[i])
                        yield u" ".join(ws)
                        break
                else:
                    # no stop phrase starts at this window
                    if es[0] in ignored:
                        # NOTE(review): the first element of a matched window is appended above too but is
                        # never looked up again, so `ignored` appears to keep one stale entry per match
                        ignored.pop(0)  # should be at the beginning
                    else:
                        yield element2input(es[0])

        self.log.debug("Transforming file for parser %s" % p)
        path, filename = os.path.split(p)
        tags = self.special_tags + [STRING_TAG]
        res = u'\n'.join(transform(xml.iter(*tags)))

        # for e in xml.iter(*tags):
        #     if e.tag in SPECIAL_TAGS:
        #         res += e.tag + "\n"
        #     else: # it's a String tag
        #         res += element2input(e) + "\n"

        # filename[:-4] cuts the last four characters (presumably a ".xml" extension — verify for other masks)
        out_path = join(self.intermediate, filename[:-4] + '-grammar-input.txt')
        with codecs.open(out_path, 'w', encoding='utf-8') as f:
            # self.log.debug("Writing into file %s" % out_path)
            f.write(res)
        return out_path, res

    def input2xml(self, text_element, attrib=None):
        """
        Turns parser input into lxml Element.
        :param text_element: unicode
        :param attrib: dict
        :return:
        """

        def input2dict(text_element):
            tmp = text_element.strip().strip(self.opening + self.closing)
            return dict(zip(self.attributes, tmp.split(self.FS)))

        if not attrib:
            attrib = {}
        attrib.update(input2dict(text_element))
        return et.Element(u"String", attrib)
        # return et.Element(u'Test', attrib={'ATTRIBUTY': str(attrib), 'NODE_CONTENT': text_element})

    def filter_child(self, node):
        """
        Tells whether the node's tag looks like a word literal, i.e. whether it contains
        two lowercase ASCII letters followed by `_SLOVO`, or the substring `slovo`.

        :param node: Element; only its `tag` is inspected
        :return: bool
        """
        word_literal_patterns = (
            re.compile(r"[a-z][a-z]_SLOVO", re.UNICODE),
            re.compile(r"slovo"),
        )
        for pattern in word_literal_patterns:
            if pattern.search(node.tag):
                return True
        return False

    def tree2xml(self, t, ruleNames=None, recog=None):
        """
        Recursively converts a parse tree into an XML tree.

        Leaves are mapped as follows:
          * non-empty error node: `Error` element if the text starts with `<`, a `String` built by
            `input2xml` with ERROR="1" if it starts with the opening symbol, an element named after
            the special tag if the text is one, otherwise a `String` with ERROR="2";
          * terminal node: a `String` built by `input2xml`, an element named after the special tag,
            or a `text_hesla` container holding a `String` with the raw text.
        Any other node becomes an element named after the node text; children for which
        `filter_child` is true are dropped and their own children are attached instead.

        :param t: TreeNode
        :param ruleNames: rule names to use; overridden by `recog.ruleNames` when recog is given
        :param recog: Parser
        :return: Element, root of the tree
        """
        if recog is not None:
            ruleNames = recog.ruleNames
        text = Trees.getNodeText(t, ruleNames, recog)

        if isinstance(t, ErrorNode) and text:
            first = text[0]
            if first == '<':  # TODO handle this case better so we can use < as opening symbol
                return et.Element('Error', {'CONTENT': text})
            if first == self.opening:
                return self.input2xml(text, {'ERROR': '1'})
            if text in self.special_tags:
                return et.Element(text)
            return et.Element('String', {'ERROR': '2', 'CONTENT': text})

        if isinstance(t, TerminalNode):
            if text and text[0] == self.opening:
                return self.input2xml(text)
            tag = text.strip()
            if tag in self.special_tags:
                return et.Element(tag)
            # all String elements must be in a container
            container = et.Element("text_hesla")
            container.append(et.Element('String', {'CONTENT': text}))
            return container

        # inner node: the element is named after the node text
        # NOTE(review): this repeats the lookup done at the top with the same arguments
        element = et.Element(Trees.getNodeText(t, ruleNames, recog))
        for subtree in t.getChildren():
            converted = self.tree2xml(subtree, ruleNames, recog)
            if self.filter_child(converted):
                # we are skipping this element but we don't want to lose its content
                for grandchild in converted:
                    element.append(grandchild)
            else:
                element.append(converted)
        return element

    def parse(self, files, rule="slovnik"):
        """
        Converts the files to parser input, joins them into one text file, parses it
        and saves the resulting XML.

        With preprocessing enabled only `files[0]` is read: its root's children are taken as the
        per-file elements (each carrying a `filename` attribute) and their ALTO metadata is
        collected into `self.metadata`. Otherwise every path in `files` is parsed directly.

        :param files: list(str)
        :param rule: name of the parser rule to start from
        :return: list containing the path of the saved parse result
        """
        self.log.info("Parsing...")

        def extract_metadata(file_xml, page_id):
            """
            Builds an `alto` element holding attribute-only copies of the ALTO Description,
            Layout, Page and page-region elements found in file_xml.
            Any failure is logged as a warning and whatever was collected so far is returned.
            :param file_xml: Element or ElementTree
            :param page_id: str, constructed from the filename by the caller
            :return: Element
            """

            def shallow_copy(path):
                # raises AttributeError when nothing matches; handled by the caller's except
                found = file_xml.find(path, namespaces=self.namespaces)
                return et.Element(found.tag, found.attrib, self.namespaces)

            alto = et.Element("alto", attrib={"id": page_id, "file": file_xml.get("filename")})
            try:
                alto.append(shallow_copy(".//alto:Description"))
                # copy the layout elements
                layout = shallow_copy('.//alto:Layout')
                alto.append(layout)
                page = shallow_copy('.//alto:Layout/alto:Page')
                layout.append(page)
                for region in 'TopMargin LeftMargin RightMargin BottomMargin PrintSpace'.split():
                    xpath = './/alto:Layout/alto:Page/alto:' + region
                    found = file_xml.find(xpath, namespaces=self.namespaces)
                    if found is None:
                        self.log.error("XPath %s produced no results in %s, %s" % (xpath, file_xml, page_id))
                        continue
                    page.append(et.Element(found.tag, found.attrib, self.namespaces))
            except Exception:
                self.log.warning("No ALTO metadata in %s" % file_xml.get('filename'), exc_info=True)
            return alto

        if self.preprocess:
            # file elements holding contents of original files
            files_xml = list(et.parse(files[0]).getroot())
            filenames = [file_elem.get("filename") for file_elem in files_xml]
            # kept on the instance because the postprocessor needs it
            self.metadata = {}
            for file_elem in files_xml:
                page_id = os.path.split(file_elem.get('filename'))[1][:-4]
                self.metadata[page_id] = extract_metadata(file_elem, page_id)
        else:
            # parse the content directly
            files_xml = [et.parse(path).getroot() for path in files]
            filenames = files

        print_spaces = (xml.find(".//alto:PrintSpace", self.namespaces) for xml in files_xml)

        self.log.info("Joining files: %s" % filenames)
        # TODO check this for files which are not processed
        page_texts = [self.xml2input(print_space, source_path)[1]
                      for print_space, source_path in izip(print_spaces, filenames)]
        joined_text = u'\n'.join(page_texts)

        joined_file = join(self.intermediate, 'grammar-input.txt')
        with codecs.open(joined_file, encoding='utf-8', mode='w') as f:
            self.log.debug("Writing into file %s" % joined_file)
            f.write(joined_text)

        root = self.parse_file(joined_file, rule=rule)

        output_path = join(self.output_dir, 'parse-result.xml')
        savexml(root, output_path)

        return [output_path]

        # for record in xml:
        # savexml(record, record.get("filename"))

    def parse_file(self, file_path, rule='slovnik'):
        """
        Parses the file at file_path record by record, starting each parse at `rule`.

        The text is split on the record separator followed by a newline, every piece is parsed
        on its own and converted with `tree2xml`; the children of all later roots are then
        attached to the first root, which is returned.

        :param file_path: str
        :param rule: str, name of the parser method to call
        :return: Element, or None when nothing was parsed
        """
        self.log.info("Parsing file %s" % file_path)
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        record_start = self.RS + "\n"
        forest = []
        parsed_chunks = 0
        for piece in text.split(record_start):
            chunk = record_start + piece
            # NOTE(review): meant to filter out empty chunks, but the separator has already been
            # prepended, so `chunk` is never empty and pieces that were empty are parsed as well
            if not chunk:
                continue
            parsed_chunks += 1
            input_stream = InputStream(chunk)
            lexer = self.lexer(input_stream)
            stream = CommonTokenStream(lexer)
            parser = self.parser(stream)
            tree = getattr(parser, rule)()  # tree = parser.slovnik()
            forest.append(self.tree2xml(tree, recog=parser))
            # drop the references to the ANTLR objects before the next chunk
            del input_stream, lexer, stream, parser, tree
            if parsed_chunks % 100 == 0:
                gc.collect()
                self.log.debug("Parsing chunk %d" % parsed_chunks)

        if not forest:
            self.log.error("The parser didn't produce anything!")
            return None

        first_root = forest[0]
        # snapshot each root's children with list() before they are attached to the first root
        first_root.extend(chain.from_iterable(list(other) for other in forest[1:]))
        return first_root


class SNSPGenerator(Generator):
    """Generator wired to the SNSP lexer/parser and `SNSPPostprocessor`; reads `*.htm` and `*.html` files."""

    def __init__(self, input_dir='', output_dir='', preprocess=True):
        super(SNSPGenerator, self).__init__(input_dir, output_dir, preprocess)

        # identification, also part of the logger name
        self.name = 'SNSPGenerator'
        self.version = '1'
        self.log = logging.getLogger("%s.%s.%s" % (__name__, self.name, self.version))

        # pipeline components
        self.file_masks = ["*.htm", "*.html"]
        # NOTE(review): the RSI preprocessor is used here as well — confirm this is intended
        self.preprocessor = RSIPreprocessor(self.output_dir)
        self.lexer = SNSPRecordLexer
        self.parser = SNSPRecordParser
        self.postprocessor_cls = SNSPPostprocessor

        # layout of the intermediate text representation
        self.RS = "RS"
        self.FS = "|"
        self.opening = "$"
        self.closing = "$"
        self.attributes = ['ALPHABET', 'STYLE', 'FONTSIZE', 'CONTENT', 'PAGE_ID', 'LANG', 'FONT', 'aip-index']
        self.special_tags = [self.RS, "INDENT", "PAR"]
        self.separators = [u',', u'v.']  # TODO: will a separator of length two work?

        # parts of speech
        self.slovni_druhy = [
            'm.', 'f.', 'n.', 'adj.', 'poss.', 'pron.', 'num. nota', 'num.', 'ipf.', 'pf.', 'praep.',
            'conj.', 'part.', 'interj.', 'ptc.', 'nom.', 'propr.',
        ]
        # indecl. belongs to gramaticke_zarazeni, see https://basecamp.com/2082305/projects/11816432/todos/297113070
        self.stopphrases = self.slovni_druhy + ['indecl.']

        self.log.info("Generator initialized")


class RSIGenerator(Generator):
    """Generator wired to the RSI lexer/parser, `RSIPreprocessor` and `RSIPostprocessor`; reads `*.htm` and `*.html` files."""

    def __init__(self, input_dir='', output_dir='', preprocess=True):
        super(RSIGenerator, self).__init__(input_dir, output_dir, preprocess)

        # identification, also part of the logger name
        self.name = 'RSIGenerator'
        self.version = '1'
        self.log = logging.getLogger("%s.%s.%s" % (__name__, self.name, self.version))

        # pipeline components
        self.file_masks = ["*.htm", "*.html"]
        self.preprocessor = RSIPreprocessor(self.output_dir)
        self.lexer = RSIRecordLexer
        self.parser = RSIRecordParser
        self.postprocessor_cls = RSIPostprocessor

        # layout of the intermediate text representation
        self.RS = "RS"
        self.FS = "|"
        self.opening = "$"
        self.closing = "$"
        self.attributes = ['ALPHABET', 'STYLE', 'FONTSIZE', 'CONTENT', 'PAGE_ID', 'LANG', 'FONT', 'aip-index']
        self.special_tags = [self.RS, "INDENT", "PAR"]
        self.separators = [u',', u'\u2192']  # u2192 = →

        # parts of speech; here they double as the stop phrases (same list object)
        self.slovni_druhy = [
            'm', 'f', 'n', 'adj', 'pron', 'num nota', 'num', 'adv', 'conj', 'praep', 'part', 'interj',
            'propr', 'indecl',
        ]
        self.stopphrases = self.slovni_druhy

        self.log.info("Generator initialized")


class SJSGenerator(Generator):
    """Generator wired to the SJS lexer/parser, `SJSPreprocessor` and `SJSPostprocessor`; reads `*.xml` files."""

    def __init__(self, input_dir='', output_dir='', preprocess=True):
        super(SJSGenerator, self).__init__(input_dir, output_dir, preprocess)

        # identification, also part of the logger name
        self.name = 'SJSGenerator'
        self.version = '1'
        self.log = logging.getLogger("%s.%s.%s" % (__name__, self.name, self.version))

        # pipeline components
        self.file_masks = ["*.xml"]
        self.preprocessor = SJSPreprocessor(self.output_dir)
        self.lexer = SJSRecord2Lexer
        self.parser = SJSRecord2Parser
        self.postprocessor_cls = SJSPostprocessor

        # layout of the intermediate text representation
        self.RS = 'BIGSKIP'  # Record Separator
        self.FS = '\t'  # Field Separator
        self.opening = "{"
        self.closing = "}"
        self.attributes = [
            'ALPHABET', 'STYLE', 'FONTSIZE', 'CONTENT', 'HPOS', 'VPOS', 'WIDTH', 'HEIGHT', 'WC',
            'PAGE_ID', 'LANG', 'FONT',
        ]
        self.special_tags = [self.RS, 'TAB']
        self.separators = [u'-', ';', ',', ':', u'♦', u'\u26ab', u'\u25CF', u'\u2014']

        # parts of speech, appended to the fixed stop phrases
        self.slovni_druhy = [
            'm.', 'f.', 'n.', 'adj.', 'poss.', 'pron.', 'num.', 'ipf.', 'pf.', 'praep.', 'conj.',
            'part.', 'interj.', 'ptc.', 'nom.', 'propr.',
        ]
        self.stopphrases = ['Exh.', 'Nota:', 'occurrit in', 'occurrit', 'Cf.', 'cf.', 'CL', 'v.'] + self.slovni_druhy

        self.log.info("Generator initialized")
