# -*- coding: utf-8 -*-

# Gorazd Generator
# Generator of dictionary entries from ALTO XML.
# Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import datetime
import logging
import os
from abc import abstractmethod, ABCMeta
from collections import Counter
from copy import deepcopy
from lxml import etree as et

from os.path import join

from utils import flatten, savexml, nwise, encapsulate, ALTO_NS, create_path, xml2unicode


class Postprocessor(object):
    __metaclass__ = ABCMeta

    def __init__(self, output_dir='out', metadata=None, result_dir=''):
        """
        Prepare output locations, the logger and the XSLT used for the MARC export.
        :param output_dir: directory that receives the aggregated outputs (errors.xml, final-marc.xml)
        :param metadata: container indexed by page id; its items are deep-copied into the records
        :param result_dir: directory for per-record files; when empty, <output_dir>/intermediate/ is used
        """
        self.output_dir = output_dir
        self.result_dir = result_dir if result_dir else join(self.output_dir, 'intermediate/')
        create_path(self.result_dir)
        self.metadata = metadata
        self.namespaces = ALTO_NS
        self.version = "1"
        self.log = logging.getLogger(__name__)
        try:
            stylesheet = et.parse('xsl/gorazd_MARC.xsl')
            self.transform = et.XSLT(stylesheet)
        except Exception:
            # Best effort: report the problem and keep going with a stylesheet
            # that copies its input unchanged.
            self.log.error("XSL transformation not found", exc_info=True)
            # identity transform
            identity_stylesheet = et.XML('''
                    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
                      <xsl:template match="@*|node()">
                        <xsl:copy>
                          <xsl:apply-templates select="@*|node()"/>
                        </xsl:copy>
                      </xsl:template>
                    </xsl:stylesheet>
            ''')
            self.transform = et.XSLT(identity_stylesheet)
        # Placeholder until run() parses a real document.
        self.xml = et.Element("None")

    def update_elements(self, rename_dict):
        """
        Helper function that gives elements determined by XPATH a new tag and attributes.
        :param rename_dict: dict that has xpaths as keys and whose values are pairs (new_tag, attributes)
        TODO: BUG -- seems it doesn't work when the same xpath is given more than once in rename_dict
        TODO: should go to utils?
        """
        for xpath, replacement in rename_dict.items():
            for e in self.xml.xpath(xpath):
                e.tag = replacement[0]
                e.attrib.update(replacement[1])

    def ensure_header(self):
        """
        Make sure every record (``heslo``) has a ``hlavicka`` that contains a ``zahlavi``.

        A missing ``hlavicka`` is inserted as the first child of the record. When the
        ``hlavicka`` has no ``zahlavi``, one is created from its direct child String if that
        String has ALPHABET "ru"; otherwise an artificial String named after the page id and
        the current time is used. In both cases the String is marked typ="nerozeznano".
        """
        self.log.info("Ensuring all records have header")
        for record in self.xml.xpath(".//heslo"):
            head = record.find(".//hlavicka")
            if head is None:
                head = et.Element('hlavicka')
                record.insert(0, head)
            if head.find(".//zahlavi") is not None:
                continue  # the record is already identified

            alto = record.find(".//metadata/alto")
            page_id = "" if alto is None else alto.get("id", "")

            caption = head.find("./String")
            if caption is None or caption.get("ALPHABET") != "ru":  # TODO get rid of "ru" here
                timestamp = str(datetime.datetime.now())  # .replace(' ', '/')
                caption = et.Element('String', attrib={"CONTENT": "nerozeznano_" + page_id + "_" + timestamp})
            caption.set("typ", "nerozeznano")

            zahlavi = et.Element('zahlavi')
            zahlavi.append(caption)  # an existing String is moved, not copied
            head.insert(0, zahlavi)

    def sweep_dangling_strings(self):
        """
        Go through the self.xml and add all top level Strings to records that precede them.
        If there are top level Strings at the beginning, create a dummy record for them.
        """
        self.log.info("Sweeping dangling Strings")
        root = self.xml.getroot()
        # if the very first element is not record
        if len(root) == 0:
            self.log.error("No records found. Empty file?")
        else:
            if "heslo" not in root[0].tag:
                record = et.Element("heslo", attrib={"TEST": "umele heslo"})
                self.xml.getroot().insert(0, record)
            else:
                record = root[0]

            strings = []
            for e in root[1:]:
                if "String" in e.tag:
                    strings.append(e)
                elif "heslo" in e.tag:
                    if strings:
                        record.extend(strings)
                        record = e
                        strings = []
                    else:
                        record = e
                else:
                    # skip other elements and log them
                    self.log.warning("Element %s encountered in the top level at line %s" % (e.tag, e.sourceline))

            # handle strings dangling at the end
            if strings:
                record.extend(strings)

    @abstractmethod
    def run(self, files):
        pass

    def filter_records(self, tag):
        '''
        Filter out records that do not contain the specified tag.
        '''
        self.log.info("Filtering out elements with tag %s" % tag)
        l = []
        for record in self.xml.xpath(".//heslo"):
            if record.find(".//" + tag, namespaces=self.namespaces) is None:
                l.append(record)
        map(lambda record: record.getparent().remove(record), l)

    def fill_lang(self):
        """
        Some elements have default language which is determined by their position in the record tree.
        """
        self.log.info("Filling in LANGUAGE and ALPHABET")
        tag2lang = {
            'zahlavi': 'ocs',
            'nasobne_zahlavi': 'ocs',
            'gramaticke_zarazeni': 'ocs',
            'text_odstavce': 'ocs',  # zde se vykystuje rustina pouze minimalne ale musime zkontrolovat ALPHABET
            'el_preklad': 'el',
            'ru_preklad': 'ru',
            # u techto se musime podivat na prarodice
            'staronemcina': 'goh',
            'cestina': 'cs',
            'nemcina': 'de'
        }
        for e in self.xml.xpath('.//String'):
            parent = e.getparent()
            tag = parent.tag
            if tag in tag2lang:  # all Strings should have parent
                if tag == 'text_odstavce':
                    if e.get('ALPHABET') == 'ru':
                        e.set('LANG', 'ocs')
                else:
                    e.set('LANG', tag2lang[tag])
            else:
                grandparent = parent.getparent()
                if (grandparent is not None) and (grandparent.tag in tag2lang):
                    e.set('LANG', tag2lang[grandparent.tag])

    def remove_elements(self, tag_list):
        """
        Removes elements whose tag is in the tag list.
        :param tag_list: list(str)
        """
        self.log.info("Removing elements: %s" % tag_list)
        for tag in tag_list:
            for e in self.xml.xpath(tag):
                parent = e.getparent()
                parent.remove(e)

    def add_alto_metadata(self):  # TODO better record numbering
        """
        We add metadata info stored from ALTO XML to all records based on page_ids of their elements.

        Every record gets a new ``metadata`` child holding a deep copy of self.metadata[page_id]
        for each distinct PAGE_ID found in the record. The copies are added in the order in
        which the page ids first occur in the record, so the result is deterministic (a set
        would yield them in arbitrary order).
        :raises KeyError: when a PAGE_ID is not present in self.metadata
        """
        self.log.info("Adding ALTO metadata")
        for heslo in self.xml.xpath(".//heslo"):
            # distinct page ids in order of first occurrence
            page_ids = []
            for e in heslo.iter():  # iterate over all subelements
                page_id = e.get("PAGE_ID")
                if page_id and page_id not in page_ids:
                    page_ids.append(page_id)
            metadata = et.Element("metadata", {'generator_ver': self.version})
            heslo.append(metadata)
            for page_id in page_ids:
                metadata.append(deepcopy(self.metadata[page_id]))

    def save_errorneous_records(self):
        """
        Copy/save records containing errors to separate files for easier debugging.

        Every record with a descendant carrying an ERROR attribute is copied once, in
        document order, into a ``slovnik`` element that is saved as errors.xml in
        self.output_dir. Flagged elements outside any record are copied into an artificial
        ``ERROR_RECORD``. The processed document itself is not modified.
        """
        self.log.info("Saving errorneous records")

        def get_record(e):
            res = list(e.iterancestors("heslo"))
            if res:
                return res[0]
            self.log.critical("Element %s doesn't belong to any record!" % xml2unicode(e))
            er = et.Element("ERROR_RECORD")
            # append a copy: appending e itself would move it out of self.xml
            er.append(deepcopy(e))
            return er

        # de-duplicate while keeping document order
        records = []
        seen = set()
        for e in self.xml.xpath("//*[@ERROR]"):
            record = get_record(e)
            if record not in seen:
                seen.add(record)
                records.append(record)

        root = et.Element("slovnik")
        root.extend([deepcopy(record) for record in records])
        self.log.info("Pocet hesel s chybami: %d" % len(root))
        savexml(root, join(self.output_dir, 'errors.xml'))

    def log_statistics(self):
        def stat(xpath):
            return len(self.xml.xpath(xpath, namespaces=self.namespaces))

        self.log.info("-------------- STATISTIKY --------------")
        self.log.info("Pocet hesel: %d" % stat("./heslo"))
        self.log.info(
            "Pocet nerozpoznanych zahlavi: %d + %d" % (stat(".//zahlavi/String[starts-with(@CONTENT, 'nerozeznano')]"),
                                                       stat(".//zahlavi/String[@typ='nerozeznano']")))
        self.log.info("Pocet elementu vyskyt: %d" % stat(".//vyskyt"))
        self.log.info("Pocet prekladu do modernich jazyku: %d" % stat(".//moderni_jazyky"))
        self.log.info("Pocet prekladu do starych jazyku: %d" % stat(".//stare_jazyky"))

    def save_records(self, padding=0):
        """
        Saves all records and produces MARC XML for them.

        Each record is written to ``<result_dir>/<alto file name>-<n>.xml`` where n counts the
        records that share the same ALTO file name, and its XSLT result to the matching
        ``-marc.xml`` file. All XSLT results are also collected into final-marc.xml in
        self.output_dir.
        :param padding: int minimal width of the ``id_heslo`` text, which is left-padded with
                        zeros. The file names themselves are not padded.
        :return:
        """
        self.log.info("Saving postprocessed records")
        self.fnames = Counter()
        self.marcs = []

        def create_filename_and_id(record):
            alto = record.find(".//alto")
            path = alto.get("file") if alto is not None else None
            if not path:
                # no alto element, or one without a usable "file" attribute
                self.log.warning("No alto information in record:\n %s" % xml2unicode(record))
                path = "missing_alto.xml"
            # file name without directory and extension (whatever the extension length)
            fname = os.path.splitext(os.path.basename(path))[0]
            self.fnames[fname] += 1
            number = self.fnames[fname]
            id_heslo = et.Element("id_heslo")
            id_heslo.text = ("%d" % number).rjust(padding, "0")
            metadata = record.find("metadata")
            if metadata is not None:
                metadata.append(id_heslo)
            else:
                self.log.warning("No metadata element in record: %s" % record)
            out_path = join(self.result_dir, fname + '-%d.xml' % number)
            return out_path

        for record in self.xml.xpath(".//heslo"):
            out_path = create_filename_and_id(record)
            # self.log.debug("Saving record into %s" % out_path)
            savexml(record, out_path)
            # XSLT to MARC
            marc_file_path = et.XSLT.strparam(os.path.abspath(out_path))
            marc_xml = self.transform(record, gXML_file_path=marc_file_path)
            savexml(marc_xml, out_path[:-4] + '-marc.xml')  # for debugging
            marc_root = marc_xml.getroot()
            if marc_root is None:
                # an empty transformation result cannot be added to the collection
                self.log.warning("XSLT produced no MARC output for %s" % out_path)
            else:
                self.marcs.append(marc_root)

        collection = et.Element("collection")
        collection.extend(self.marcs)
        savexml(collection, join(self.output_dir, 'final-marc.xml'))

    def update_attributes_on_content(self, contents, attrib):
        """
        :param contents: unicode
        :param attrib: dict
        :return:
        """
        xpath = u"//String[" + " or ".join(map(lambda s: u"starts-with(@CONTENT,'" + s + "')", contents)) + "]"
        for e in self.xml.xpath(xpath):
            e.attrib.update(attrib)

    def _adjoin_hyphen(self, xpath, preceding=True):
        """
        Check if element matched by xpath is preceded/succeeded  by hyphen. If yes, adjoin the hyphen. 
        """
        counter = 0
        for e in self.xml.xpath(xpath):
            if preceding:
                h = e.getprevious()
            else:
                h = e.getnext()
            if h is not None:
                if h.tag == 'text_hesla' and len(h) > 0:
                    if preceding:
                        h = h[-1]
                    else:
                        h = h[0]
                if h.get("CONTENT", "") == "-":
                    counter += 1
                    content = e.get("CONTENT", "")
                    if preceding:
                        e.set("CONTENT", "-" + content)
                    else:
                        e.set("CONTENT", content + "-")
                    parent = h.getparent()
                    parent.remove(h)
                    if parent.tag == "text_hesla" and len(parent) == 0:
                        grandparent = parent.getparent()
                        grandparent.remove(parent)
        self.log.debug("Joined %d hyphens for xpath %s" % (counter, xpath))

    def fix_hyphens(self):
        '''
        Adjoin preceding hyphens to String in gramaticke_zarazeni
        https://basecamp.com/2082305/projects/11816432/todos/303802865#comment_518934805
        https://basecamp.com/2082305/projects/11816432/todos/303802007#comment_518932987
        :return: 
        '''
        # this takes care of Strings with hyphen that precede gramaticke_zarazeni
        self.log.info("Fixing hyphens")
        for e in self.xml.xpath("//gramaticke_zarazeni"):
            if e[0].get("CONTENT", "0")[0] == "-":
                pass
            else:
                p = e.getprevious()
                if p is not None and p.tag == "text_hesla":
                    if p.tag != 'String' and len(p) > 0:
                        h = p[-1]
                    else:
                        h = p
                    if h.get("CONTENT", "") == "-":
                        e.insert(0, h)
                    if len(p) == 0:
                        gp = p.getparent()
                        gp.remove(p)

        self._adjoin_hyphen("//gramaticke_zarazeni/String", preceding=True)


class SNSPPostprocessor(Postprocessor):
    def __init__(self, output_dir='out', metadata=None, result_dir=''):
        """
        Initialize the base postprocessor, then switch to the SNSP stylesheet, version
        string and a version-specific logger.
        """
        super(SNSPPostprocessor, self).__init__(output_dir, metadata, result_dir)
        # TODO unify XSL into one file
        stylesheet = et.parse('xsl/gorazd_MARC_SNSP.xsl')
        self.transform = et.XSLT(stylesheet)
        self.version = "MS.1"
        self.log = logging.getLogger("%s.%s" % (__name__, self.version))
        self.log.info("Postprocessor initialized")

    def handle_text_hesla(self):
        """
        Clean up ``text_hesla`` after grammar parsing: apply ``flatten`` to nested
        ``text_hesla``, retag every ``gramaticke_zarazeni`` sibling that follows the first
        ``frekvence`` of a record, and retag ``slovni_druh`` inside ``odstavec``.
        """
        self.log.info("Cleanup after grammar parsing -- text_hesla")
        nested_paths = (
            "//slovni_druh/text_hesla",
            "//text_hesla/text_hesla",
            "//gramaticke_zarazeni/text_hesla",
        )
        for nested in nested_paths:
            flatten(self.xml, nested)
        # all gramaticke_zarazeni that come after frekvence should be text_hesla
        for heslo in self.xml.xpath("//heslo"):
            frekvence = heslo.find(".//frekvence")
            if frekvence is None:
                continue
            for sibling in frekvence.itersiblings("gramaticke_zarazeni"):
                sibling.tag = "text_hesla"
        # see https://basecamp.com/2082305/projects/11816432/todos/268951261#comment_511395500
        for word_class in self.xml.xpath("//odstavec//slovni_druh"):
            word_class.tag = "text_hesla"

    def handle_frekvence(self):
        """
        Only the first element frekvence of each record survives, the rest are turned into text_hesla
        """
        self.log.info("Getting rid of superfluous elements frekvence")
        for heslo in self.xml.xpath("//heslo"):
            for position, node in enumerate(heslo.iter("frekvence")):
                if position:  # everything but the first occurrence
                    node.tag = "text_hesla"

    def interpunction(self):
        """
        Split a trailing colon, long dash, semicolon, comma or hyphen off the text of a String
        into a separate String inserted right after it.
        A colon that follows a digit (or stands between digits) stays in the same String.

        The new String carries only the CONTENT attribute.

        see https://basecamp.com/2082305/projects/11816432/todos/303176442
        """
        self.log.info("Fixing interpunction")
        characters = u':—;,-'
        res = []
        for e in self.xml.xpath("//String"):
            c = e.get("CONTENT", "")
            if len(c) < 2 or c[-1] not in characters:
                continue
            # str.isdigit works for byte and unicode strings alike, so the
            # Python-2-only ``unicode`` builtin is not needed here
            if c[-1] == ":" and c[-2].isdigit():
                continue
            res.append(e)
        for e in res:
            c = e.get("CONTENT")
            ne = et.Element("String", {"CONTENT": c[-1]})
            e.set("CONTENT", c[:-1])
            parent = e.getparent()
            parent.insert(parent.index(e) + 1, ne)

    def add_fonts(self):
        """
        Set FONT="FreeSerif" on every String except those already using CyrillicaBulgarian10U.
        https://basecamp.com/2082305/projects/11816432/todos/303163327
        """
        self.log.info("Adding FONTs")
        for string in self.xml.xpath("//String"):
            if string.get("FONT", "") == "CyrillicaBulgarian10U":
                continue
            string.set("FONT", "FreeSerif")

    def fix_hyphens(self):
        """
        If a short hyphen "-" stands before the element "gramaticke_zarazeni", it becomes part
        of it without a space.
        If a short hyphen "-" stands after a word in Cyrillic, it becomes part of that word
        without a space.
        https://basecamp.com/2082305/projects/11816432/todos/303802007#comment_518932987
        """
        super(SNSPPostprocessor, self).fix_hyphens()
        cyrillic_strings = "//text_hesla/String[@ALPHABET='ru']"
        self._adjoin_hyphen(cyrillic_strings, preceding=False)

    def merge_text_hesla(self):
        """
        Merge every ``text_hesla`` into a ``text_hesla`` that directly precedes it.
        """
        self.log.info("Merging consecutive text_hesla")
        blocks = self.xml.xpath("//text_hesla")
        # Walk backwards so that a run A, B, C collapses into A: C goes into B
        # first, then B (now holding C's children) goes into A.
        for block in blocks[::-1]:
            before = block.getprevious()
            if before is None or before.tag != "text_hesla":
                continue
            before.extend(block)
            block.getparent().remove(block)

    def fix_gramaticke_zarazeni(self):
        """
        The element "gramaticke_zarazeni" may occur only before the element "slovni_druh".
        https://basecamp.com/2082305/projects/11816432/todos/303802476

        Every gramaticke_zarazeni whose position within its parent is greater than the position
        of the record's last slovni_druh within its parent is retagged to text_hesla.
        NOTE(review): the two positions are compared even when the parents differ -- confirm
        that both elements always share a parent.
        """
        self.log.info("Fixing gramaticke_zarazeni")
        for heslo in self.xml.xpath(".//heslo"):
            word_classes = heslo.xpath(".//slovni_druh")
            if not word_classes:
                continue
            last = word_classes[-1]
            boundary = last.getparent().index(last)
            for gr in heslo.xpath(".//gramaticke_zarazeni"):
                if gr.getparent().index(gr) > boundary:
                    gr.tag = "text_hesla"

    def fix_header_split(self):
        """
        Everything before the first occurrence of "]" is part of the header (hlavicka);
        everything after it is part of the paragraph (odstavec).
        If "]" is not present, the header ends with the element frekvence.
        https://basecamp.com/2082305/projects/11816432/todos/303801844

        The elements found to lie past the end of the header are retagged to text_hesla and
        moved to the beginning of the record's odstavec, which is created when missing.
        """
        def get_top(n):
            # Climb from n until its parent is hlavicka (or there is no parent
            # left) and return the element reached.
            while n is not None and n.getparent() is not None and n.getparent().tag != "hlavicka":
                n = n.getparent()
                # NOTE(review): n cannot be None here, the loop condition has
                # just checked that the parent exists -- this branch looks unreachable.
                if n is None:
                    self.log.error("No hlavicka found in record %s" % xml2unicode(heslo))
                    break
            return n

        self.log.info("Fixing header splits")
        hdr_counter, str_counter = 0, 0
        for heslo in self.xml.xpath("//heslo"):
            frekvence = heslo.find("hlavicka/frekvence")
            if frekvence is None:
                # no frekvence: start from the first String in hlavicka that opens a bracket
                candidates = heslo.xpath("(.//hlavicka//String[starts-with(@CONTENT, '[')])[1]")
                if candidates:
                    n = get_top(candidates[0])
                else:
                    continue
            else:
                n = frekvence.getnext()

            if n is not None and len(n) > 0:  # there are some things after element frekvence
                c = n[0].get("CONTENT", "")
                if n[0].tag != "String":
                    self.log.error(u"Unexpected element while fixing header:\n %s" % xml2unicode(n))
                if len(c) > 0 and c[0] == "[":  # we must find the closing bracket
                    # NOTE(review): the following:: axis skips descendants of n, so a "]"
                    # inside n itself is not considered -- confirm this is intended.
                    endings = n.xpath("following::String[contains(@CONTENT, ']')][1]")
                    if not endings:
                        continue
                    n = endings[0]
                    leftover_strings = n.xpath("following-sibling::String")
                    n = get_top(n)
                    if n is not None:
                        n = n.getnext()
                    else:
                        continue
                else:
                    leftover_strings = []

                if n is not None:
                    # n is now the first element that should not be in the header
                    # let's collect all Strings that maybe left to move in the element before n
                    # NOTE(review): leftover_strings only decides whether the new text_hesla
                    # is created; the Strings themselves are never added to it -- verify.
                    if len(leftover_strings) > 0:
                        text_hesla = et.Element("text_hesla")
                        transfer = [text_hesla, n]
                    else:
                        transfer = [n]
                    transfer.extend(n.xpath("following-sibling::*"))
                    for e in transfer:
                        e.tag = "text_hesla"

                    # move the elements to odstavec (and create it if it doesn't exist)
                    if len(transfer) > 0:
                        # count the children of the moved elements for the log message
                        str_counter += sum(len(x) for x in transfer)
                        hdr_counter += 1
                        hlavicka = heslo.find("hlavicka")
                        odstavec = heslo.find("odstavec")
                        if odstavec is None:
                            self.log.debug(u"There is no odstavec following:\n %s" % xml2unicode(hlavicka))
                            odstavec = hlavicka.makeelement("odstavec")
                            hlavicka.addnext(odstavec)
                        # insert at the front of odstavec, keeping the order of transfer
                        i = 0
                        for e in transfer:
                            odstavec.insert(i, e)
                            i += 1
        self.log.info("Moved %d strings from %d headers" % (str_counter, hdr_counter))

    def run(self, files):
        """
        Postprocess every file in files with the SNSP pipeline.

        Each file is parsed into self.xml, cleaned up by the steps below (their order
        matters, see the inline comments), saved next to the input as
        ``<name>-postprocessed.xml`` and then exported record by record.
        :param files: iterable of paths to the XML files to process
        :return: the files argument, unchanged
        """
        for file_path in files:
            self.xml = et.parse(file_path)

            # drop records that contain no String at all
            self.filter_records("String")
            # unify element names and mark the record type and dictionary
            replacements = {
                './/heslo': ['heslo', {'typ': 'hlavni', 'slovnik': 'SNSP'}],
                './/odkazove_heslo': ['heslo', {'typ': 'odkazove', 'slovnik': 'SNSP'}],
                './/odkazova_hlavicka': ['hlavicka', {}],
                './/odkazove_zahlavi': ['zahlavi', {}],
                './/nasobne_odkazove_zahlavi': ['nasobne_zahlavi', {}],
                './/hlavicka_text_hesla': ['text_hesla', {}],
                './/nerozpoznano': ['text_hesla', {}],
            }
            self.update_elements(replacements)

            self.remove_elements(['//RS', '//INDENT', '//PAR'])
            self.handle_frekvence()
            self.handle_text_hesla()
            # self.fill_lang(xml)
            self.add_alto_metadata()
            # drop records that received no alto metadata
            self.filter_records("alto")
            self.sweep_dangling_strings()
            self.ensure_header()  # we need to run this after we rename the record element and add the metadata

            self.interpunction()
            self.fix_gramaticke_zarazeni()
            self.fix_header_split()  # must be run after handle_frekvence and before merge_text_hesla
            self.merge_text_hesla()  # must be run before fix_hyphens
            self.fix_hyphens()
            self.add_fonts()

            # file_path[:-4] strips the last four characters (presumably ".xml")
            output_path = file_path[:-4] + "-postprocessed.xml"
            savexml(self.xml, output_path)

            self.save_errorneous_records()

            # save records to different files
            self.save_records(padding=4)
            self.log_statistics()
        return files


class RSIPostprocessor(Postprocessor):
    def __init__(self, output_dir='out', metadata=None, result_dir=''):
        """
        Initialize the base postprocessor, then switch to the RSI stylesheet, version
        string and a version-specific logger.
        """
        super(RSIPostprocessor, self).__init__(output_dir, metadata, result_dir)
        # TODO unify XSL into one file
        stylesheet = et.parse('xsl/gorazd_MARC_RSI.xsl')
        self.transform = et.XSLT(stylesheet)
        self.version = "RSI.1"
        self.log = logging.getLogger("%s.%s" % (__name__, self.version))

    def handle_text_hesla(self):
        """
        Apply ``flatten`` to nested ``text_hesla`` and to ``slovni_druh`` inside ``odstavec``,
        in this order.
        """
        paths = (
            "//slovni_druh/text_hesla",
            "//text_hesla/text_hesla",
            "//odstavec//slovni_druh",
        )
        for path in paths:
            flatten(self.xml, path)

    def handle_ocs_headers(self):
        """
        Normalize ``ssl_zahlavi`` headers.

        A header enclosed in square brackets loses the brackets and is marked with
        nepresne="1", as are all its descendant Strings. A leading "+" of the first child's
        CONTENT is split off into a new ``text_hesla`` (holding a String with
        aip-index="horni") that is inserted right before the header.
        Headers without children, and children with empty CONTENT, are left alone.
        """
        for h in self.xml.xpath("//ssl_zahlavi"):
            if len(h) == 0:
                continue
            s = h[0].get("CONTENT")
            e = h[-1].get("CONTENT")
            if s and e and s[0] == "[" and e[-1] == "]":
                if len(h) == 1:
                    # both brackets live in the same String
                    h[0].set("CONTENT", s[1:-1])
                else:
                    h[0].set("CONTENT", s[1:])
                    h[-1].set("CONTENT", e[:-1])
                h.set("nepresne", "1")
                # Editor wants the same attribute for all descendant String elements
                for string in h.xpath(".//String"):
                    string.set("nepresne", "1")
            # if the header starts with + split it off and make it a sup script
            first = h[0]
            content = first.get("CONTENT", "")
            if content.startswith("+"):
                text_hesla = et.Element("text_hesla")
                plus = et.Element("String", {"aip-index": "horni", "CONTENT": "+"})
                text_hesla.append(plus)
                odstavec = h.getparent()
                odstavec.insert(odstavec.index(h), text_hesla)
                first.set("CONTENT", content[1:])

    def run(self, files):
        """
        Postprocess every file in files with the RSI pipeline.

        Each file is parsed into self.xml, cleaned up by the steps below, saved next to the
        input as ``<name>-postprocessed.xml`` and then exported record by record.
        :param files: iterable of paths to the XML files to process
        :return: the files argument, unchanged
        """
        for file_path in files:
            self.xml = et.parse(file_path)

            # drop records that contain no String at all
            self.filter_records("String")
            # unify element names and mark the record type and dictionary
            replacements = {
                './/heslo': ['heslo', {'typ': 'hlavni', 'slovnik': 'RSI'}],
                './/odkazove_heslo': ['heslo', {'typ': 'odkazove', 'slovnik': 'RSI'}],
                # './/heslo': ['heslo', {'slovnik': 'RSI'}],
                './/odkazova_hlavicka': ['hlavicka', {}],
                './/*[starts-with(local-name(), "poznamka_")]': ['text_hesla', {}],
                './/indikator_odstavce': ['text_hesla', {}],
                './/text_odstavce': ['text_hesla', {}],
                './/hlavicka_text_hesla': ['text_hesla', {}],
                './/nerozpoznano': ['text_hesla', {}],
                './/confer_odkaz': ['odkaz', {}],
            }
            self.update_elements(replacements)
            # flatten(self.xml, ".//text_hesla/slovni_druh")
            #
            # flatten(self.xml, ".//slovni_druhy/slovni_druh")
            # self.update_elements({'//slovni_druhy': ['slovni_druh', {}]})

            self.handle_ocs_headers()

            self.remove_elements(['//RS', '//INDENT', '//PAR'])
            self.handle_text_hesla()
            # self.fill_lang(xml)
            self.add_alto_metadata()
            # drop records that received no alto metadata
            self.filter_records("alto")
            self.ensure_header()  # we need to run this after we rename the record element and add the metadata
            self.sweep_dangling_strings()

            # style_updates = {
            #     './/zahlavi/String':
            #         ['String', {'ALPHABET': 'ru',
            #                     'LANG': 'ocs',
            #                     'FONTSIZE': 'big',
            #                     'STYLE': 'bold',
            #                     }],
            # }
            # self.update_elements(style_updates)

            # small_words = ['add.', 'Cf.', 'cf.', 'erronee', 'Exh.', 'fig.', 'It.', 'lacuna', 'liturg.', 'mutilatum',
            #                'neg.', 'om.', 'refl.', 'sc.', 'sic!', 'sim.', 'var.', 'Vg.', '(dat.)', '(gen.)', '(pl.)',
            #                '(sic!)', 'in hoc sensu exh.', 'per errorem', '(It.)', '(Vg.)']
            # self.update_attributes_on_content(small_words, {'FONTSIZE': 'small'})

            # self.handle_indices()

            # file_path[:-4] strips the last four characters (presumably ".xml")
            output_path = file_path[:-4] + "-postprocessed.xml"
            savexml(self.xml, output_path)

            self.save_errorneous_records()

            # save records to different files
            self.save_records(padding=4)
            self.log_statistics()
        return files


class SJSPostprocessor(Postprocessor):
    def __init__(self, output_dir='out', metadata=None, result_dir=''):
        '''
        Set up the base postprocessor, then tag this instance as the SJS variant.

        The three arguments are handed unchanged to ``Postprocessor.__init__``.
        Afterwards ``self.version`` is overwritten with "SJS.1" and ``self.log``
        is replaced by a logger named ``<module name>.SJS.1``.
        '''
        super(SJSPostprocessor, self).__init__(
            output_dir=output_dir,
            metadata=metadata,
            result_dir=result_dir,
        )
        self.version = "SJS.1"
        # Version-specific logger name: module name and version joined by a dot.
        logger_name = "{0}.{1}".format(__name__, self.version)
        self.log = logging.getLogger(logger_name)

    def handle_text_hesla(self):
        '''
        Restructure ``text_hesla`` elements of ``self.xml`` in place.

        First calls ``encapsulate`` for the path "heslo/text_hesla" with the
        name "odstavec", then calls ``flatten`` for ``text_hesla`` nested in
        ``slovni_druh`` and for ``text_hesla`` nested in another ``text_hesla``,
        in that order.
        '''
        tree = self.xml
        # presumably wraps the matched elements in an "odstavec" element --
        # confirm against utils.encapsulate
        encapsulate(tree, "heslo/text_hesla", "odstavec")
        # The order of the two flatten passes is kept exactly as before.
        for nested_path in ("//slovni_druh/text_hesla", "//text_hesla/text_hesla"):
            flatten(tree, nested_path)

    def fix_apostrophe(self):
        '''
        Swap U+2019 for U+2E2F in Old Church Slavonic strings.

        Every ``String`` element with ``LANG='ocs'`` found in ``self.xml`` has
        its ``CONTENT`` attribute rewritten with each U+2019 (RIGHT SINGLE
        QUOTATION MARK) replaced by U+2E2F (VERTICAL TILDE).  An element that
        has no ``CONTENT`` attribute ends up with an empty one.
        '''
        # TODO refactor replace_strings to handle cases like this
        for ocs_string in self.xml.xpath(".//String[@LANG='ocs']"):
            content = ocs_string.get("CONTENT", u"")
            ocs_string.set("CONTENT", content.replace(u'\u2019', u'\u2e2f'))

    def run(self, files):
        '''
        Postprocess every file in ``files`` with the SJS rule set.

        For each path the file is parsed into ``self.xml`` and then run through
        a fixed sequence of steps: record filtering, element renaming, removal
        of ``TAB`` elements, ``text_hesla`` restructuring, metadata and header
        handling, attribute (style) updates, index tagging and string fixes.
        The resulting tree is saved next to the input with the suffix
        "-postprocessed.xml", after which the individual records, the
        erroneous records and the statistics are written out by the
        corresponding helper methods.

        :param files: iterable of input file paths
        :return: ``files``, unchanged
        '''
        for file_path in files:
            self.xml = et.parse(file_path)
            self.filter_records("String")
            # XPath -> [tag name, attributes]; presumably the matched elements are
            # renamed and given these attributes -- confirm against update_elements.
            replacements = {
                './/heslo': ['heslo', {'typ': 'hlavni', 'slovnik': 'SJS'}],
                './/odkazove_heslo': ['heslo', {'typ': 'odkazove', 'slovnik': 'SJS'}],
                './/odkazova_hlavicka': ['hlavicka', {}],
                './/*[starts-with(local-name(), "poznamka_")]': ['text_hesla', {}],
                './/indikator_odstavce': ['text_hesla', {}],
                './/text_odstavce': ['text_hesla', {}],
                './/nerozpoznano': ['text_hesla', {}],
                './/confer_odkaz': ['odkaz', {}],
            }
            self.update_elements(replacements)
            self.remove_elements(['//TAB'])
            self.handle_text_hesla()
            # self.fill_lang(xml)
            self.sweep_dangling_strings()
            self.add_alto_metadata()
            self.filter_records("alto")
            self.ensure_header()  # we need to run this after we rename the record element and add the metadata

            # XPath -> ['String', attributes] for String elements, keyed by where
            # the String sits in the record.
            # NOTE(review): some of these XPaths can match the same String (e.g.
            # './/vyskyt//String' and './/text_hesla/String'); if update_elements
            # applies the entries in dict iteration order, the winning values
            # depend on that order -- confirm.
            style_updates = {
                './/text_hesla/String[@ALPHABET="ru"]':  # Russian text is automatically Old Church Slavonic unless it is a Russian translation
                    ['String', {'LANG': 'ocs'}],
                './/zahlavi/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ocs',
                                'FONTSIZE': 'big',
                                'STYLE': 'bold',
                                }],
                './/zahlavi/gramaticke_zarazeni/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ocs',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/nasobne_zahlavi/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ocs',
                                'FONTSIZE': 'big',
                                'STYLE': 'bold',
                                }],
                './/nasobne_zahlavi/gramaticke_zarazeni/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ocs',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/text_hesla/String':
                    ['String', {'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/vyskyt//String':  # all text in the vyskyt section should be small
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'la',
                                'FONTSIZE': 'small',
                                'STYLE': 'normal',
                                }],
                './/cestina/la_preklad/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'cs',
                                'FONTSIZE': 'normal',
                                'STYLE': 'italics',
                                }],
                './/rustina/ru_preklad/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ru',
                                'FONTSIZE': 'normal',
                                'STYLE': 'italics',
                                }],
                './/nemcina/la_preklad/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'de',
                                'FONTSIZE': 'normal',
                                'STYLE': 'italics',
                                }],
                './/rectina/el_preklad/String':
                    ['String', {'ALPHABET': 'el',
                                'LANG': 'el',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/latina/latina_predloha/la_preklad/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'la',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/latina/latina_preklad/la_preklad/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'la',
                                'FONTSIZE': 'normal',
                                'STYLE': 'italics',
                                }],
                './/staronemcina/la_preklad/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'goh',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/odkaz/String':
                    ['String', {'ALPHABET': 'ru',
                                'LANG': 'ocs',
                                'FONTSIZE': 'normal',
                                'STYLE': 'normal',
                                }],
                './/exh/String':
                    ['String', {'ALPHABET': 'la',
                                'LANG': 'la',
                                'FONTSIZE': 'small',
                                'STYLE': 'normal',
                                }],
            }
            self.update_elements(style_updates)

            # Words and abbreviations handed to update_attributes_on_content
            # together with FONTSIZE 'small'.
            small_words = ['add.', 'Cf.', 'cf.', 'erronee', 'Exh.', 'fig.', 'It.', 'lacuna', 'liturg.', 'mutilatum',
                           'neg.', 'om.', 'refl.', 'sc.', 'sic!', 'sim.', 'var.', 'Vg.', '(dat.)', '(gen.)', '(pl.)',
                           '(sic!)', 'in hoc sensu exh.', 'per errorem', '(It.)', '(Vg.)']
            self.update_attributes_on_content(small_words, {'FONTSIZE': 'small'})

            self.handle_indices()
            self.fix_apostrophe()
            self.fix_hyphens()

            self.replace_occurrit()

            # NOTE: drops the last four characters of the path, i.e. assumes a
            # three-letter extension such as ".xml".
            output_path = file_path[:-4] + "-postprocessed.xml"
            savexml(self.xml, output_path)

            # save records to different files
            self.save_records(padding=2)
            self.save_errorneous_records()
            self.log_statistics()
        return files

    def pair_strings(self, element_list, pairs, attributes):
        '''
        Tag the second element of known content pairs and merge its parent
        into the parent of the first element.

        ``element_list`` is walked in groups of two produced by ``nwise``
        (presumably consecutive, overlapping pairs -- confirm against
        utils.nwise).  When the ``CONTENT`` values of a group form a tuple that
        is contained in ``pairs``, the second element receives ``attributes``.
        If the two elements live in different parents, all children of the
        second parent are moved to the end of the first parent and the emptied
        second parent is removed from the tree.

        :param element_list: sequence of lxml elements carrying ``CONTENT``
        :param pairs: collection of ``(first_content, second_content)`` tuples
        :param attributes: mapping of attributes to set on the second element
        '''
        # The filler element has no CONTENT, so a trailing incomplete group can
        # never match an entry of ``pairs``.
        for first, second in nwise(element_list, n=2, fillvalue=et.Element("None")):
            if (first.get("CONTENT"), second.get("CONTENT")) not in pairs:
                continue
            second.attrib.update(attributes)

            first_parent = first.getparent()
            second_parent = second.getparent()
            if first_parent is None or second_parent is None:
                # Detached element: nothing to merge.
                continue
            if first_parent is second_parent:
                # Both strings already share a parent.  Merging the parent into
                # itself and then removing it would delete the pair (and its
                # siblings) from the tree, so only the attributes are applied.
                continue

            # lxml moves (does not copy) the children when extending.
            first_parent.extend(second_parent)
            container = second_parent.getparent()
            if container is not None:
                container.remove(second_parent)

    def handle_indices(self):
        '''
        Mark index tokens that directly follow a known abbreviation.

        Two tables of ``(abbreviation, index)`` content pairs are used: one for
        upper indices (``aip-index="horni"``) and one for lower indices
        (``aip-index="dolni"``).  Both tables are first applied to the
        ``//pamatka/String`` elements through ``pair_strings``; afterwards the
        ``//text_hesla/String`` elements are scanned in groups of two and the
        second element of a matching group only gets the ``aip-index``
        attribute.
        '''
        pamatka_strings = self.xml.xpath("//pamatka/String")

        # Abbreviations followed by a single letter -> upper index.
        sup_index_pairs = [
            (u'Zogr', u'b'),
            (u'Nik', u'a'), (u'Nik', u'b'),
            (u'Mosk', u'a'), (u'Mosk', u'b'),
            (u'Kij', u'b'),
            (u'SlužSof', u'a'), (u'SlužSof', u'b'), (u'SlužSof', u'c'),
        ]
        self.pair_strings(pamatka_strings, sup_index_pairs, {"aip-index": 'horni'})

        # Abbreviations followed by "bis" or "ter" -> lower index.  All "bis"
        # pairs come first, then all "ter" pairs.
        bis_ter_sources = (u'As', u'Sav', u'Ostr', u'En', u'Ochr', u'Slepč',
                           u'Mak', u'Šiš', u'Grig', u'Zach', u'Lobk')
        sub_index_pairs = [(source, suffix)
                           for suffix in (u'bis', u'ter')
                           for source in bis_ter_sources]
        self.pair_strings(pamatka_strings, sub_index_pairs, {'aip-index': 'dolni'})

        # Inside text_hesla the strings are only tagged, never merged.
        index_kinds = ((sup_index_pairs, "horni"), (sub_index_pairs, "dolni"))
        for first, second in nwise(self.xml.xpath("//text_hesla/String"), 2, et.Element("None")):
            contents = (first.get("CONTENT"), second.get("CONTENT"))
            for known_pairs, kind in index_kinds:
                if contents in known_pairs:
                    second.set("aip-index", kind)

    def replace_occurrit(self):
        '''
        Replace "occurrit in" with "occurring in".
        https://basecamp.com/2082305/projects/11816432/todos/303176561

        Only ``String`` elements matched by ``//vyskyt/text_hesla/String`` whose
        ``CONTENT`` is exactly "occurrit in" are rewritten; every other element
        is left untouched.
        '''
        for string_el in self.xml.xpath("//vyskyt/text_hesla/String"):
            if string_el.get("CONTENT", "") == "occurrit in":
                # Single space, as documented above (the replacement used to be
                # written with a doubled space).
                string_el.set("CONTENT", "occurring in")
