# -*- coding: utf-8 -*-

# Gorazd Generator
# Generator of dictionary entries from ALTO XML.
# Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import datetime
import errno
import logging
import os
import re
import subprocess
import sys
import unicodedata
from itertools import izip, tee, islice, izip_longest
from lxml import etree as et, html

import distance
from os.path import join

# Prefix -> URI namespace map for ALTO v3; passed as ``namespaces=`` to xpath()
# calls and as ``nsmap=`` when new elements are created in this module.
ALTO_NS = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'}
# Path of the bibupload executable launched by run_bibupload().
BIBUPLOAD_BINARY = '/opt/invenio/bin/bibupload'

# Module-level logger named after this module.  A do-nothing NullHandler is
# attached here; the real handlers are configured in init_logs().
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())


class Replacer(object):
    """
    Fast multi-string replacer used in the SJS preprocessor.
    http://stackoverflow.com/questions/3367809/efficiently-carry-out-multiple-string-replacements-in-python

    Single-character substitutions (``chars``) are applied first with
    ``translate``; afterwards every occurrence of a key of ``replacements``
    is replaced by its value in one regular-expression pass.

    :param replacements: dict mapping substrings to their replacement strings
    :param chars: dict mapping single characters to replacement strings, or None/empty
    """

    def __init__(self, replacements, chars):
        if chars:
            # translate() expects a table keyed by code point
            self.chars = {ord(k): v for k, v in chars.items()}
        else:
            self.chars = None
        self.replacements = replacements
        # Longest keys go first: regex alternation takes the first alternative
        # that matches, so with dict ordering a short key could shadow a longer
        # key that starts with it.  Empty keys are ignored because they would
        # match at every position.
        keys = sorted((k for k in replacements if k), key=len, reverse=True)
        if keys:
            self.locator = re.compile(u'|'.join(re.escape(k) for k in keys))
        else:
            # nothing to search for; an empty pattern would match everywhere
            self.locator = None

    # doesn't work in case we need to chain replacements
    def _doreplace(self, mo):
        """Return the replacement for the substring matched by ``mo``."""
        return self.replacements[mo.group()]

    def replace(self, s):
        """
        Apply the character table and then the substring replacements to ``s``.
        :param s: unicode
        :return: unicode with all substitutions applied
        """
        if self.chars:
            s = s.translate(self.chars)
        if self.locator is None:
            return s
        return self.locator.sub(self._doreplace, s)


def replace_strings(xml, word_substitutions, char_substitutions=None):
    """
    Rewrite the CONTENT attribute of every alto:String element below ``xml`` in place.
    :param xml: element (or tree) to search with xpath
    :param word_substitutions: dict of substring -> replacement handed to Replacer
    :param char_substitutions: optional dict of character -> replacement handed to Replacer
    :return: None, the tree is modified in memory
    """
    substitute = Replacer(word_substitutions, char_substitutions).replace
    for string_el in xml.xpath(".//alto:String", namespaces=ALTO_NS):
        # a missing CONTENT attribute is treated as an empty string
        original = unicode(string_el.get("CONTENT", u""))
        string_el.set("CONTENT", substitute(original))


def fix_greek(xml, txt):
    """
    This tries to fix wrong old Greek diacritics in ALTO XML as produced by ABBY Recognition Server 4.
    The txt contains unicode produced by ABBY Finereader which recognizes old Greek diacritics but
    cannot produce ALTO XML.

    Strips greek words of their diacritics and then tries to match them using edit distance.
    For each successful match replaces the Greek word in Element CONTENT attribute with the matched one from the txt.

    The work is done in two passes:
    1. exact pass - every String whose first Greek run equals (ignoring diacritics) a word from txt gets
       that word; a String directly preceding a Greek String and consisting only of a Greek, Latin or
       Cyrillic "o" is replaced by U+1F41,
    2. approximate pass - words from txt that were not used in pass 1 are matched against the remaining
       Strings by normalized edit distance.

    :param xml: Element or ElementTree
    :param txt: unicode
    :return: None as it changes the xml tree in memory
    """
    correct_greek = []
    # Finereader concatenates words at linebreak but doesn't lose the "-"
    gr_word = re.compile(u"\\b[\u0370-\u03FF\u1F00-\u1FFF]+-?[\u0370-\u03FF\u1F00-\u1FFF]+\\b", flags=re.UNICODE)
    for match in gr_word.finditer(txt):
        correct_greek.append(u"".join(match.group().split("-")))
    log.debug("We have %d greek words" % len(correct_greek))

    # because we look at the second in the pair and we don't want to miss the first String
    string_elements = [et.Element("String", {"CONTENT": u"Τὸἐμὸνἀεροπλοῖονἐγχελείωνπλῆρέςἐστιν"}, encoding='utf-8')]
    string_elements += xml.xpath(".//alto:PrintSpace//alto:String", namespaces=ALTO_NS)

    # TODO what if there are more greek words inside the CONTENT attribute?
    # Original (pre-replacement) content of every String, always as unicode.
    content = {e: unicode(e.get("CONTENT")) for e in string_elements}

    # Pair every String with its predecessor.  The pairs are built from a
    # snapshot because string_elements is shrunk inside the loop (matched
    # elements are removed so the approximate pass does not see them);
    # iterating the live list would shift the pairing after the first removal.
    ordered_elements = list(string_elements)
    pairs = izip(ordered_elements, ordered_elements[1:])
    o_number = 0

    gr_re = re.compile(u"[\u0370-\u03FF\u1F00-\u1FFF]+", flags=re.UNICODE)
    w_number = 0

    for e1, e2 in pairs:
        e2content = content[e2]  # already unicode, also for Strings without CONTENT
        match = gr_re.search(e2content)
        if match is None:
            continue
        e1c = remove_diacritics(content[e1])
        # TODO should we bother to check in correct_greek?
        # If it is not correctly recognized in Finereader maybe we are better off with the greek o everywhere.
        if e1c in [u"ο", u"o", u"\u043e"]:
            e1.set('CONTENT', u'\u1F41')
            o_number += 1
        # only the Greek run is compared; e2content can contain more characters, typically commas at the end
        test_word = remove_diacritics(match.group())
        for word in correct_greek:
            if remove_diacritics(word) == test_word:
                replacement = e2content[:match.start()] + word + e2content[match.end():]
                if replacement not in e2content:  # if we've actually changed anything, print it out
                    log.debug(u"Replacing: %s --> %s" % (e2content, replacement))
                    w_number += 1
                e2.set('CONTENT', replacement)
                # each word from txt is used at most once; remove() drops its first occurrence
                correct_greek.remove(word)
                string_elements.remove(e2)
                break
    log.info(u"We have produced %d greek \u1F41 and changed %d greek words." % (o_number, w_number))
    log.info(u"We failed to match %d words from TXT file." % len(correct_greek))

    a_number = 0
    log.info(u"Trying approximate matches for:\n [%s]" % u", ".join(correct_greek))
    for word in correct_greek:
        distances = []
        for e in string_elements:
            # TODO tune this parameter, TODO do we need full gr_re here instead of just strip?
            d = distance.fast_comp(remove_diacritics(word), remove_diacritics(content[e].strip(' ,.;:!')))
            distances.append((float(d) / len(word), e))
        # negative values are excluded together with distances that are too large
        approximate_matches = [pair for pair in distances if 0.0 <= pair[0] <= 0.3]  # TODO tune this parameter
        if approximate_matches:
            # compare on the distance only so that ties never compare Elements
            d, e = min(approximate_matches, key=lambda pair: pair[0])
            ec = content[e]
            match = gr_re.search(ec)
            if match:
                replacement = ec[:match.start()] + word + ec[match.end():]
                e.set('CONTENT', replacement)
                a_number += 1
                log.debug(u"Replacing based on edit distance %4.3f: %s --> %s \t " % (d, ec, replacement))
            else:
                log.warning(u"There is no greek match in %s" % ec)
    log.info(u"Changed %d greek words using edit distance heuristic." % a_number)


def join_xml(output_dir, files):
    """
    Merge several XML files into a single tree.

    The children of each input file's root are moved under a <File> element
    carrying the absolute path in its ``filename`` attribute; all <File>
    elements hang under one <fileList> root.  The merged tree is also written
    to ``joined-files.xml`` in ``output_dir``.
    :param output_dir: directory receiving joined-files.xml
    :param files: list of file paths
    :return: the <fileList> root element
    """
    merged = et.Element("fileList")
    for path in files:
        log.debug("Reading file: %s" % path)
        parsed_root = et.parse(path).getroot()
        wrapper = et.Element("File", {'filename': os.path.abspath(path)}, nsmap=ALTO_NS)
        wrapper.extend(parsed_root)
        merged.append(wrapper)
    target = join(output_dir, 'joined-files.xml')
    savexml(merged, target)
    log.info("Files merged into %s" % os.path.abspath(target))
    return merged


def savexml(xml, fname):
    """
    Write ``xml`` to ``fname`` as pretty-printed UTF-8 with an XML declaration.
    :param xml: lxml ElementTree, or an Element that gets wrapped in one
    :param fname: target file name
    """
    # lxml
    tree = xml if isinstance(xml, et._ElementTree) else et.ElementTree(xml)
    tree.write(fname, encoding='utf-8', xml_declaration=True, pretty_print=True)


def xml2unicode(xml, pretty_print=True):
    """
    Serialize ``xml`` with lxml and return the result as a unicode string.
    :param xml: Element or ElementTree
    :param pretty_print: passed through to et.tostring
    :return: unicode
    """
    serialized = et.tostring(xml, encoding='utf-8', pretty_print=pretty_print)
    return serialized.decode('utf-8')


def remove_diacritics(s):
    """
    Decompose the unicode string s (NFKD) and drop all non-spacing marks.
    http://stackoverflow.com/a/4164212 http://stackoverflow.com/a/518232
    :param s: unicode
    :return: unicode without characters of category 'Mn'
    """
    decomposed = unicodedata.normalize('NFKD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def nwise(iterable, n=2, fillvalue=None):
    """
    Sliding window over an iterable: yields n-tuples of consecutive items, one
    tuple per item.  Positions past the end of the input are filled with fillvalue.
    :param iterable: Iterable
    :param n: int, size of the window
    :param fillvalue: value used for the missing positions of the last tuples
    :return: Iterable of n-tuples
    """
    copies = tee(iterable, n)
    for offset, copy in enumerate(copies):
        # advance the k-th copy by k items so that the copies are staggered
        for _ in islice(copy, offset):
            pass
    return izip_longest(*copies, fillvalue=fillvalue)


def create_path(path):
    """
    Create directory ``path`` including all missing parents.

    An already existing directory only produces a warning; any other failure
    is logged as critical and terminates the program with exit status 1.
    :param path: directory to create
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        already_present = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_present:
            log.critical("Something wrong happened when creating directories: %s" % path)
            sys.exit(1)
        log.warning("The directory %s exists, some files may be overwritten." % path)


def run_bibupload(generator):  # not tested
    """
    Run the bibupload binary on the generator's result file.

    Executes ``BIBUPLOAD_BINARY -i <generator.result> -N <str(generator)>`` and
    reports progress and failures through ``generator.log``.
    :param generator: object providing ``log``, ``output_dir`` and ``result``
    :return: True when bibupload exits with status 0, False when it cannot be
             started or exits with a non-zero status
    """
    generator.log.info("Running bibupload for %s" % generator.output_dir)
    run_list = [BIBUPLOAD_BINARY, '-i', generator.result, '-N', str(generator)]
    generator.log.info('Attempting to run: ' + ' '.join(run_list))
    try:
        output = subprocess.check_call(run_list)
    except OSError as err:
        generator.log.error('Bibupload binary %s not found. (%s)' % (BIBUPLOAD_BINARY, str(err)))
        # report the failure explicitly, like the CalledProcessError branch
        return False
    except subprocess.CalledProcessError as err:
        generator.log.error('Failed to run bibupload with error:')
        generator.log.error(str(err))
        return False
    generator.log.info(str(output))
    return True


def flatten(xml, xpath):
    """
    Replace every element matched by xpath with its child elements.

    The children take the position of the matched element in its parent and
    keep their original order; the matched element itself is removed.  Only
    child elements are moved, text directly inside the matched element is not.
    A matched element without a parent (the root) is left untouched.
    :param xml: Element or ElementTree to modify in place
    :param xpath: xpath expression selecting the elements to dissolve
    """
    for e in xml.xpath(xpath):
        parent = e.getparent()
        if parent is None:
            # the root has no parent that could receive its children
            continue
        e_id = parent.index(e)
        # Insert at increasing positions; inserting every child at e_id
        # would reverse their order.  list(e) is a snapshot because the
        # children are moved out of e while we iterate.
        for offset, child in enumerate(list(e)):
            parent.insert(e_id + offset, child)
        parent.remove(e)


def encapsulate(xml, xpath, tag):
    """
    Wrap every element matched by xpath in a new element <tag>.

    The wrapper is put at the position the matched element had in its parent,
    so the order of siblings does not change.  A matched element without a
    parent (the root) is left untouched.
    :param xml: Element or ElementTree to modify in place
    :param xpath: xpath expression selecting the elements to wrap
    :param tag: tag name of the wrapping element
    """
    for e in xml.xpath(xpath):
        parent = e.getparent()
        if parent is None:
            # nothing to attach the wrapper to
            continue
        # remember the position before e is moved into the wrapper
        position = parent.index(e)
        enc = et.Element(tag)
        enc.append(e)
        parent.insert(position, enc)


def word2alto(output_dir, filename):
    """
    Simple transformer of Word documents saved as "Web page, simplified" to (probably nonvalid) ALTO XML.

    Every non-empty <p> of the HTML becomes a flat sequence of marker elements (INDENT, PAR, WS) and
    alto:String elements carrying the word in CONTENT plus styling attributes; a <p> without text
    becomes a single <RS/>.  The output is written to ``<basename of filename>-alto.xml`` in
    ``output_dir`` and then re-saved pretty-printed.
    :param output_dir: directory to which the transformed file will be saved
    :param filename: file to transform
    :return: filepath of the transformed file
    """
    # compiled once per call; raw string so that "\s" is a regex escape, not a string escape
    regexp_fontsize = re.compile(r'font-size:\s*[0-9]+')
    prefix_length = 10  # the length of 'font-size:' that is part of the match
    string_tag = "{%s}String" % ALTO_NS['alto']

    def get_context(e):
        """
        Gather styling information for element e and returns it as a dictionary.
        Ancestors are visited from the outermost one down to e, so inner
        elements override the font settings of outer ones.
        :param e: Element
        :return: dict with a subset of the keys aip-index, FONTSIZE, FONT, STYLE
        """

        def get_fontsize(e):
            """Classify the font-size of e's style attribute as small (<9), big (>12) or normal."""
            try:
                declaration = regexp_fontsize.findall(e.get('style'))[0]
                size = int(declaration[prefix_length:])
            except (TypeError, IndexError, ValueError):
                # no style attribute, or no font-size in it
                return "normal"
            if size < 9:
                return "small"
            if size > 12:
                return "big"
            return "normal"

        def get_font_family(span):
            """Return the first known font name mentioned in span's style attribute, or None."""
            style = span.get('style', "")
            fontlist = ['CyrillicaBulgarian10U', 'Arial Narrow', 'Free Serif']
            for font in fontlist:
                if font in style:
                    return font
            return None

        res = {}
        bold, italics = False, False
        # drop html, body and div elements
        elist = list(reversed(list(e.iterancestors())[:-3])) + [e]
        for p in elist:
            tag = p.tag.lower()
            if tag == "sub":
                res['aip-index'] = 'dolni'
            elif tag == "sup":
                res['aip-index'] = 'horni'
            elif tag == 'b':
                bold = True
            elif tag == 'i':
                italics = True
            elif tag == 'span' or tag == 'p':
                res['FONTSIZE'] = get_fontsize(p)
                font = get_font_family(p)
                if font:
                    res["FONT"] = font
        if italics:  # italics has precedence
            res['STYLE'] = 'italics'
        elif bold:
            res['STYLE'] = 'bold'
        return res

    def transform_text(text, context):
        """
        Turn a non-empty piece of text into alto:String elements separated by WS elements.
        A leading or trailing space character of the text yields an extra WS element.
        :param text: non-empty unicode
        :param context: dict of styling attributes; its CONTENT key is overwritten
        :return: list of Elements
        """

        def make_string(word):
            # the attributes are copied into the new element, so reusing the dict is safe
            context['CONTENT'] = word
            return et.Element(string_tag, attrib=context, nsmap=ALTO_NS)

        res = []
        if text[0] == ' ':
            res.append(et.Element("WS"))
        for position, word in enumerate(text.split()):
            if position:
                res.append(et.Element("WS"))
            res.append(make_string(word))
        if text[-1] == ' ':
            res.append(et.Element("WS"))
        return res

    def transform_paragraph(p):
        """
        Recursively transform element p, its children and its tail text.
        :param p: Element
        :return: list of Elements in document order
        """
        res = []
        if 'margin-left' in p.get('style', ''):
            res.append(et.Element("INDENT"))
        if p.tag.lower() == 'p':
            res.append(et.Element("PAR"))
        if p.text:
            context = get_context(p)
            res.extend(transform_text(p.text, context))
        for e in p:
            res.extend(transform_paragraph(e))
        if p.tail:
            # the tail follows p, so it is styled by p's parent
            context = get_context(p.getparent())
            res.extend(transform_text(p.tail, context))
        return res

    docroot = html.parse(filename).getroot()
    output_filename = join(output_dir, os.path.basename(filename + '-alto.xml'))
    log.info("Transforming Word to alto -> %s" % output_filename)
    rs = et.Element("RS")
    with et.xmlfile(output_filename, encoding='utf-8') as xf:
        with xf.element('file', nsmap=ALTO_NS):
            with xf.element("{%s}PrintSpace" % ALTO_NS['alto'], nsmap=ALTO_NS):
                for p in list(docroot.iter("p")):
                    p_text_content = "".join(p.itertext())
                    if p_text_content.strip() == "":
                        xf.write(rs)
                    else:
                        xf.write(*transform_paragraph(p))

    # pretty print for DEBUG
    xml = et.parse(output_filename).getroot()
    savexml(xml, output_filename)

    return output_filename


def init_logs(name, log_dir='', console_loglevel=logging.DEBUG, info_file_loglevel=logging.DEBUG,
              err_file_loglevel=logging.WARNING,
              default_level=logging.DEBUG):
    """
    Configure the 'gorazd_generator' logger with a console handler and two file handlers.

    The files are named ``<name>-<timestamp>.log`` and ``<name>-<timestamp>.err`` and are
    created in ``log_dir``, which is created first (including missing parent directories)
    when it does not exist.  If the files cannot be opened the problem is logged as
    critical and only console logging stays active.
    :param name: prefix of the log file names
    :param log_dir: directory for the log files; '' means the current working directory
    :param console_loglevel: level of the console handler
    :param info_file_loglevel: level of the .log file handler
    :param err_file_loglevel: level of the .err file handler
    :param default_level: level of the logger itself
    """
    generator_log = logging.getLogger('gorazd_generator')
    generator_log.setLevel(default_level)

    formatter = logging.Formatter(u'%(asctime)s:%(levelname)s:%(name)s> %(message)s', "%Y-%m-%d %H:%M:%S")

    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_loglevel)
    console_handler.setFormatter(formatter)
    generator_log.addHandler(console_handler)

    if log_dir == '':
        log_dir = os.getcwdu()

    if not os.path.exists(log_dir):
        # makedirs so that log_dir may be nested in directories that do not exist yet
        os.makedirs(log_dir)

    timestamp = str(datetime.datetime.now()).replace(' ', '_')
    log_filename = name + '-'
    info_log = join(log_dir, log_filename + timestamp + '.log')
    err_log = join(log_dir, log_filename + timestamp + '.err')

    try:
        info_file_handler = logging.FileHandler(info_log, 'w', 'utf-8')
        info_file_handler.setLevel(info_file_loglevel)
        info_file_handler.setFormatter(formatter)

        err_file_handler = logging.FileHandler(err_log, 'w', 'utf-8')
        err_file_handler.setLevel(err_file_loglevel)
        err_file_handler.setFormatter(formatter)

        # attached only after both files could be opened
        generator_log.addHandler(info_file_handler)
        generator_log.addHandler(err_file_handler)

    except IOError as err:
        # best effort: keep running with console logging only
        generator_log.critical("Unable to open file for logging.")
        generator_log.critical(str(err))

    generator_log.debug('Logging initialized')


def handle_exception(exc_type, exc_value, exc_traceback):
    """
    Log an unhandled exception, including its traceback, at CRITICAL level.
    Takes the same three arguments as sys.excepthook.
    """
    details = (exc_type, exc_value, exc_traceback)
    log.critical("Uncaught exception", exc_info=details)
