# -*- coding: utf-8 -*-

# Gorazd Generator
# Generator of dictionary entries from ALTO XML.
# Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import codecs
import logging
import os
import re
from PIL import Image, ImageDraw
from abc import abstractmethod, ABCMeta
from collections import Counter
from functools import partial
from itertools import ifilter, izip
from lxml import etree as et

from os.path import join

from utils import create_path, nwise, replace_strings, fix_greek, join_xml, savexml, word2alto

# Debug switch read by SJSPreprocessor.insert_bigskips: when True, the page image belonging to
# each ALTO file is opened, the detected record boundaries are drawn onto it as coloured lines
# and the annotated image is saved into the output directory.
DEBUG_INSERT_BIGSKIP = False  # insert lines into image based on coordinates from ALTO XML


class Preprocessor(object):
    __metaclass__ = ABCMeta

    def __init__(self, output_dir='out', intermediate=''):
        """
        Prepare the output locations and an empty XML placeholder.

        :param output_dir: directory for the results; it is handed to create_path
        :param intermediate: directory for intermediate files; when empty,
                             'intermediate/' inside output_dir is used instead
        """
        self.ns = {u'alto': u'http://www.loc.gov/standards/alto/ns-v3#'}

        self.output_dir = output_dir
        create_path(self.output_dir)

        self.intermediate = intermediate if intermediate else join(self.output_dir, 'intermediate/')
        create_path(self.intermediate)

        # placeholder root element until run() loads the real document
        self.xml = et.Element("None")
        # NOTE: this replaces any logger a subclass assigned before calling this constructor
        self.log = logging.getLogger(__name__)
        self.log.info("Preprocessor initialized")

    @abstractmethod
    def run(self, files):
        pass

    def fill_page_id(self):
        """
        Each String element needs to remeber on which page it was found.
        """
        self.log.info("Filling in page_id")
        for file in self.xml:
            id = os.path.split(file.get('filename'))[1][:-4]
            for e in file.xpath(".//alto:String", namespaces=self.ns):
                e.set("PAGE_ID", id)

    def normalize_fontstyle(self):
        self.log.info("Normalizing font style")
        for file in self.xml:
            for e in file.xpath(".//alto:String", namespaces=self.ns):
                c = e.get("STYLE", "")
                n = c.replace("bold", "").strip()
                e.set("STYLE", n)

    def fill_alphabet(self):
        '''
        Fills in ALPHABET tag for all Strings in xml.
        It is determined on the content of LANG and if that is empty, then we guess based on CONTENT
        '''
        self.log.info("Filling in ALPHABET")
        ar = {
            'ru': re.compile(r'[\[\]()\u0400-\u04FF\u2DE0-\u2DFF\uA640-\uA69F\u2C00-\u2C5F]', re.UNICODE),
            'el': re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]', re.UNICODE),
            'la': re.compile(r'[\u0000-\u007f\u0080-\u00ff\u0100-\u017f]', re.UNICODE)
        }

        lang2alph = {
            'la': 'la',
            'cs': 'la',
            'de': 'la',
            'en': 'la',
            'el': 'el',
            'ru': 'ru',
            'ocs': 'ru'
        }
        for e in self.xml.xpath(".//alto:String", namespaces=self.ns):
            content = e.get("CONTENT", "").strip(' -,;:.')
            matches = {}
            if content:
                for l in ar:
                    matches[l] = float(len(ar[lang2alph[l]].findall(content))) / len(content)
                l = max(matches, key=matches.get)  # returns the key whose values is maximal
                # self.log.debug("Content: '%s' of length %d"% (content, len(content)))
                # self.log.debug(matches)
                if matches[l] > 0.5:  # TODO tune this parameter
                    e.set("ALPHABET", lang2alph[l])
                else:
                    lang = e.get("LANG")
                    if lang:
                        e.set("ALPHABET", lang2alph[lang])
                    else:
                        e.set("ALPHABET", 'la')  # default alphabet is latin


class RSIPreprocessor(Preprocessor):
    def __init__(self, output_dir='out', intermediate=''):
        """
        Set up the preprocessor and give it a logger named after its version.

        :param output_dir: passed on to the base class constructor
        :param intermediate: passed on to the base class constructor
        """
        self.version = "RSI.1"
        super(RSIPreprocessor, self).__init__(output_dir, intermediate)
        # The base constructor assigns the generic module logger to self.log, so the
        # version-specific logger has to be installed afterwards; assigning it first
        # meant it was silently overwritten.
        self.log = logging.getLogger(__name__ + "." + self.version)

    def join_words(self):
        """
        Merge neighbouring elements which together form one word.

        The output of MS Word is not ideal: a single word may arrive split into
        several consecutive elements.  The passes below run in this order; each one
        first collects pairs of neighbours (in document order) and only then merges
        them from the back, so that removals never disturb the walk over the tree:

        1. consecutive Strings with equal STYLE and ALPHABET, unless the second one
           carries an 'aip-index' attribute (sub/sup index),
        2. a "+" String is prepended to the following String,
        3. a "(" String is prepended to the following String and a ")" String is
           appended to the preceding one,
        4. all WS (word separator) elements are thrown out,
        5. leftover "[" and "]" elements are attached in the same way,
        6. a leading superscript (one of the characters ¹²³) is moved to the end of
           the preceding String,
        7. leftover "+" elements are prepended to the following String.
        """
        self.log.info("Joining words")
        xml = self.xml
        string_tag = "{{{alto}}}String".format(alto=self.ns['alto'])

        def is_string(element):
            return element.tag == string_tag

        def neighbours(predicate):
            # all pairs of consecutive elements accepted by predicate(a, b)
            filler = et.Element("None")  # partner for the very last element
            return [(a, b) for a, b in nwise(xml.iter(), fillvalue=filler) if predicate(a, b)]

        def prepend_first(pairs):
            # put the content of the first element in front of the second, drop the first
            for a, b in reversed(pairs):
                b.set("CONTENT", a.get("CONTENT", "") + b.get("CONTENT", ""))
                a.getparent().remove(a)

        def append_second(pairs):
            # put the content of the second element behind the first, drop the second
            for a, b in reversed(pairs):
                a.set("CONTENT", a.get("CONTENT", "") + b.get("CONTENT", ""))
                b.getparent().remove(b)

        # join consecutive strings if they have same style and alphabet and the second one is not a sub/sup index
        def same_word(a, b):
            return (b.get('aip-index') is None and
                    is_string(a) and is_string(b) and
                    a.get("STYLE") == b.get("STYLE") and
                    a.get("ALPHABET") == b.get("ALPHABET"))

        for a, b in reversed(neighbours(same_word)):
            a.set("CONTENT", a.get("CONTENT", "") + b.get("CONTENT", ""))
            if b.get("FONTSIZE") == "big":
                a.set("FONTSIZE", 'big')
            if b.get("STYLE"):
                a.set("STYLE", b.get("STYLE"))
            b.getparent().remove(b)

        # join the plus symbol
        prepend_first(neighbours(
            lambda a, b: is_string(a) and is_string(b) and a.get("CONTENT") == "+"))

        # join round brackets
        pairs = neighbours(lambda a, b: is_string(a) and is_string(b) and a.get("CONTENT") == "(")
        self.log.debug("Left round brackets left to join: %d" % len(pairs))
        prepend_first(pairs)

        pairs = neighbours(lambda a, b: is_string(a) and is_string(b) and b.get("CONTENT") == ")")
        self.log.debug("Right round brackets left to join: %d" % len(pairs))
        append_second(pairs)

        # throw out word separators WS; the matches are materialised first because
        # removing elements while iterating over the tree is not safe
        for ws in list(xml.iter("WS")):
            ws.getparent().remove(ws)

        # in case we missed joining some square brackets
        pairs = neighbours(lambda a, b: a.get("CONTENT", "") == "[" and is_string(b))
        self.log.debug("Left square brackets left to join: %d" % len(pairs))
        prepend_first(pairs)

        pairs = neighbours(lambda a, b: b.get("CONTENT", "") == "]" and is_string(a))
        self.log.debug("Right square brackets left to join: %d" % len(pairs))
        append_second(pairs)

        # join special upper indices, see https://basecamp.com/2082305/projects/11816432/todos/297113522
        def starts_with_superscript(a, b):
            content = b.get("CONTENT", "")
            # the emptiness test guards content[0] against an empty CONTENT
            return bool(content) and content[0] in u"¹²³" and is_string(a)

        for a, b in reversed(neighbours(starts_with_superscript)):
            bcontent = b.get("CONTENT")
            a.set('CONTENT', a.get("CONTENT", "") + bcontent[0])
            if len(bcontent) > 1:
                b.set("CONTENT", bcontent[1:])
            else:
                b.getparent().remove(b)

        # join + for old church slavonic headers; equality is required here because
        # a substring test (`in u"+"`) would also accept an empty CONTENT
        prepend_first(neighbours(lambda a, b: a.get("CONTENT") == u"+" and is_string(b)))

    def run(self, files):
        """
        Run the whole preprocessing pipeline on the given files.

        Every file is passed through word2alto, the results are merged with
        join_xml, preprocessed and written to the intermediate directory.

        :param files: the input files
        :return: single-item list holding the path of the written XML file
        """
        self.log.info("Preprocessing the following files:\n %s" % files)
        alto_files = [word2alto(self.intermediate, source) for source in files]
        self.xml = join_xml(self.intermediate, alto_files)

        # normalize_fontstyle() is disabled for this preprocessor
        for step in (self.fill_alphabet, self.join_words, self.fill_page_id):
            step()

        output_path = join(self.intermediate, 'joined-files-preprocessed.xml')
        self.log.info("Writing output to %s" % output_path)
        savexml(self.xml, output_path)
        return [output_path]


class SJSPreprocessor(Preprocessor):
    def __init__(self, output_dir='out', intermediate=''):
        """
        Set up the preprocessor, its substitution tables and a logger named after its version.

        :param output_dir: passed on to the base class constructor
        :param intermediate: passed on to the base class constructor
        """
        self.version = "SJS.1"
        self._init_char_fixers()
        super(SJSPreprocessor, self).__init__(output_dir, intermediate)
        # The base constructor assigns the generic module logger to self.log, so the
        # version-specific logger has to be installed afterwards; assigning it first
        # meant it was silently overwritten.
        self.log = logging.getLogger(__name__ + "." + self.version)

    def _init_char_fixers(self):
        """
        Create the substitution tables used for correcting OCR output.

        They are kept in a separate method in order not to clutter __init__:

        * self.CHARS maps single characters produced by the OCR to the characters
          they stand for (mostly Old Church Slavonic letters),
        * self.WORDS maps misread words, abbreviations and word fragments to their
          corrected forms.

        Both tables are handed to replace_strings in fix_characters; how exactly
        they are applied is decided there.
        TODO: read these tables from external file?
        """
        self.CHARS = {
            # SLAVONIC CHARS
            u'\u0023': u'\uA64B',  # u'#': u'ꙋ',
            u'\u0024': u'\uA645',  # u'$': u'ꙅ',
            u'\u0025': u'\uA647',  # u'%': u'ꙇ',
            u'\u0026': u'\uA659',  # u'&': u'ꙙ',
            u'\u0040': u'\uA657',  # u'@': u'ꙗ',
            u'\u005F': u'\u0483',  # u'_': u'҃',
            u'\u007E': u'\u0482',  # u'~': u'҂',
            u'\u00A3': u'\u046F',  # u'£': u'ѯ',
            u'\u00A5': u'\u0471',  # u'¥': u'ѱ',
            u'\u20AC': u'\u0467',  # u'€': u'ѧ',
            u'\u00B1': u'\u0465',  # u'±': u'ѥ',
            u'\u00A7': u'\u047F',  # u'§': u'ѿ',
            u'\u00A9': u'\u2C51',  # u'©': u'ⱑ',
            u'\u00AE': u'\u0469',  # u'®': u'ѩ',
            u'\u003C': u'\u2039',  # u'<': u'‹',
            u'\u003E': u'\u203A',  # u'>': u'›',
            u'\u00AB': u'\u0461',  # u'«': u'ѡ',
            u'\u00BB': u'\u046B',  # u'»': u'ѫ',
            u'\u25A1': u'\u0481',  # u'□': u'ҁ',
            u'\u25B2': u'\u046D',  # u'▲': u'ѭ',
            u'\u00B0': u'\u0484',  # u'°': u'҄',
            u'\u2122': u'\uA651',  # u'™': u'ꙑ',
            u'\u25A0': u'\uA643',  # u'■': u'ꙃ',
            u'\u25B3': u'\u0457',  # u'△': u'ї',
            u'\u25BC': u'\u017F',  # u'▼': u'ſ',
            u'\u25BD': u'\u0107',  # u'▽': u'ć',
            u'\u25BB': u'\u00D7',  # u'▻': u'×',
            u'\u25BA': u'\u045B',  # u'►': u'ћ',
            u'\u25C4': u'\u0455',  # u'◄': u'ѕ',
            # u'\u25CE': u'\u25CF', #  u'◎': u'●',
            u'\u25CE': u'\u26AB',  # u'◎': u'⚫',
            #            u'\u2014': '-'  # replace the long dash with a short one
        }
        self.WORDS = {
            # literary monuments (abbreviated source names)
            u'Bnd': u'Bud',
            u'CanMls': u'CanMis',
            u'Cbii': u'Chil',
            u'Cbll': u'Chil',
            u'Cbrist': u'Christ',
            u'Cbrlst': u'Christ',
            u'Encb': u'Euch',
            u'Eng': u'Eug',
            u'Gchr': u'Ochr',
            u'Grlg': u'Grig',
            u'Hvai': u'Hval',
            u'Lndm': u'Ludm',
            u'Loh': u'Lob',
            u'Nanm': u'Naum',
            u'Nlk': u'Nik',
            u'Ocbr': u'Ochr',
            u'Rnmj': u'Rumj',
            u'Sln': u'Sin',
            u'Slnck': u'Sluck',
            u'Slnž': u'Služ',
            u'Snpr': u'Supr',
            u'Šlš': u'Šiš',
            u'Zaeh': u'Zach',
            # other
            u'(gem)': u'(gen.)',
            u'(geu.)': u'(gen.)',
            u'suh': u'sub',
            u'urhs': u'urbs',
            u'nnnc': u'nunc',
            u'Zacb': u'Zach',
            u'Snd': u'Sud',
            u'Pocbv': u'Pochv',
            u'Ciem': u'Clem',
            u'Cioz': u'Cloz',
            u'Stepč': u'Slepč',
            u'Slcpč': u'Slepč',
            u'Sicpč': u'Slepč',
            u'Psail': u'Psalt',
            u'VcncNik': u'VencNik',
            u'Eucb': u'Euch',
            u'Coust': u'Const',
            u'Chrlst': u'Christ',
            u'CMLah': u'CMLab',
            u'scrihitur': u'scribitur',
            u'Napls': u'Napis',
            u'Kiim': u'Klim',
            u'aiiter': u'aliter',
            u'ailter': u'aliter',
            u'allter': u'aliter',
            u'bebraica': u'hebraica',
            u'Exb.': u'Exh.',
            u'Exdr.': u'Exh.',
            u'ging.': u'glag.',
            u'iegitnr': u'legitur',
            u'incertnm': u'incertum',
            u'legitnr': u'legitur',
            u'legltur': u'legitur',
            u'lelgltnr': u'legitur',
            u'lncertnm': u'incertum',
            u'mniier': u'mulier',
            u'mniler': u'mulier',
            u'mnlier': u'mulier',
            u'mnller': u'mulier',
            u'muiler': u'mulier',
            u'muller': u'mulier',
            u'nnlns': u'unius',
            u'nuins': u'unius',
            u'obilt': u'obiit',
            u'oblit': u'obiit',
            u'occnrrit': u'occurrit',
            u'occnrrlt': u'occurrit',
            u'occurrlt': u'occurrit',
            u'Ocetio': u'Lectio',
            u'ohiit': u'obiit',
            u'ohilt': u'obiit',
            u'ohlit': u'obiit',
            u'Pbrygia': u'Phrygia',
            u'scriptnm': u'scriptum',
            u'secnndum': u'secundum',
            u'sensn': u'sensu',
            u'snb': u'sub',
            u'snbiernnt': u'subierunt',
            u'snbiit': u'subiit',
            u'snbilt': u'subiit',
            u'snblit': u'subiit',
            u'snbllt': u'subiit',
            u'snh': u'sub',
            u'sublit': u'subiit',
            u'subllt': u'subiit',
            u'substituendns': u'substituendus',
            u'nnns': u'unus',
            u'unns': u'unus',
            u'uuius': u'unius',
            # parts of words
            u'cbristian': u'christian',
            u'ibns': u'ibus',
            u'icns': u'icus',
            u'ihns': u'ibus',
            u'ihus': u'ibus',
            u'mntilat': u'mutilat',
            u'plnr': u'plur',
            u'propbet': u'prophet',
            u'tnr': u'tur',
            # third batch
            u'adb': u'adj.',
            u'Apqc': u'Apoc',
            u'Bcn': u'Bon',
            u'Bnmj': u'Rumj',
            u'Bumj': u'Rumj',
            u'CauMis': u'CanMis',
            u'Cluz': u'Cloz',
            u'CMNav': u'CMNov',
            u'Cunst': u'Const',
            u'Ench': u'Euch',
            u'Febrnarii': u'Februarii',
            u'fiiius': u'filius',
            u'Gctobris': u'Octobris',
            u'gracca': u'graeca',
            u'Gstr': u'Ostr',
            u'ipL': u'ipf.',
            u'kaleud.': u'kalend.',
            u'liugua': u'lingua',
            u'lpf.': u'ipf.',
            u'Octohris': u'Octobris',
            u'Pcg': u'Pog',
            u'semei': u'semel',
            u'Siepč': u'Slepč',
            u'Siuž': u'Služ',
            u'Sohol.': u'Sobol.',
            u'Stojanovič': u'Stojanović',
            u'Zcgr': u'Zogr',
            u'ъі': u'ꙑ',  # (U+A651)
            # fourth batch
            u'(iat.)': u'(lat.)',
            u'Acbr': u'Achr',
            u'Bcs': u'Bes',
            u'Bcu': u'Bon',
            u'Dcč': u'Deč',
            u'Chrahr': u'Chrabr',
            u'ocbr': u'Ochr',
            u'occIIrrit': u'occurrit',
            u'ostr': u'Ostr',
            u'Psait': u'Psalt',
            u'Septemhris': u'Septembris',
            u'zogr': u'Zogr',
            # fifth batch
            u'8upr': u'Supr',
            u'Beu': u'Ben',
            u'Cbiland': u'Chiland',
            u'Ccnst': u'Const',
            u'Clcz': u'Cloz',
            u'Eavrov': u'Lavrov',
            u'exb.': u'exh.',
            u'Exil.': u'Exh.',
            u'gcn.': u'gen.',
            u'Iij': u'Ilj',
            u'occIlrrit': u'occurrit',
            u'occllrrit': u'occurrit',
            u'ochr': u'Ochr',
            u'Os1r': u'Ostr',
            u'Rcs': u'Bes',
            u'SIIpr': u'Supr',
            u'Sill': u'Sin',
            u'SIlpr': u'Supr',
            u'Siu': u'Sin',
            u'Siuck': u'Sluck',
            u'vajs': u'Vajs',
            u'vcncNik': u'VencNik',
            u'venc': u'Venc',
            u'vencNik': u'VencNik',
            u'Zcb': u'Zch',
            u'έπꙇ҃': u'ἐπὶ',
            # SJS II
            u'(lf.)': u'(It.)',
            u'(vg.)': u'(Vg.)',
            u'4Bg': u'4Rg',
            u'absof.': u'absol.',
            u'aee.': u'acc.',
            u'B0s': u'Bes',
            u'baec': u'haec',
            u'Bau': u'Bon',
            u'Bou': u'Bon',
            u'Bum': u'Rum',
            u'Eneh': u'Euch',
            u'Eueh': u'Euch',
            u'Exln': u'Exh.',
            u'Gcbr': u'Ochr',
            u'Gcc.': u'Occ.',
            u'Gr.': u'Or.',
            u'Griente': u'Oriente',
            u'Hah': u'Hab',
            u'Hh': u'Hb',
            u'Hiif': u'Hilf',
            u'his': u'bis',
            u'christ': u'Christ',
            u'iuuctio': u'iunctio',
            u'Jagič': u'Jagić',
            u'KlimBum': u'KlimRum',
            u'Leb': u'Lob',
            u'Lchk': u'Lobk',
            u'lpL': u'ipf.',
            u'Lub': u'Lob',
            u'Luh': u'Lob',
            u'mcmoria': u'memoria',
            u'Metb': u'Meth',
            u'Nieod': u'Nicod',
            u'Nlccd': u'Nicod',
            u'occmrit': u'occurrit',
            u'occurnt': u'occurrit',
            u'oecurrit': u'occurrit',
            u'oeenrrit': u'occurrit',
            u'Oehr': u'Ochr',
            u'Pag': u'Pog',
            u'Pcchv': u'Pochv',
            u'pL': u'pf.',
            u'rcfi.': u'refl.',
            u'refi.': u'refl.',
            u'Res': u'Bes',
            u'Reu': u'Ben',
            u'Ron': u'Bon',
            u'seribitur': u'scribitur',
            u'seusu': u'sensu',
            u'siš': u'Šiš',
            u'sobol.': u'Sobol.',
            u'Sobor': u'Sobol.',
            u'Stojauovič': u'Stojanović',
            u'subsf.': u'subst.',
            u'supr': u'Supr',
            u'Syncd': u'Synod',
            u'Synud': u'Synod',
            u'tantnm': u'tantum',
            u'tautum': u'tantum',
            u'Tnn': u'Tun',
            u'Tuu': u'Tun',
            u'vaš.': u'Vaš.',
            u'vcl': u'vel',
            u'VeneNik': u'VencNik',
            u'veucNik': u'VencNik',
            u'VeucNov': u'VencNov',
            u'vit': u'Vit',
            u'Zaeb': u'Zach',
            u'Zagr': u'Zogr',
            u'Zeh': u'Zch',
            u'Zugr': u'Zogr',
            u'εꙇ҃ς': u'εἰς',
            u'καꙇ҃': u'καὶ',
            u'ꙇ҃να': u'ἵνα',
        }

    def join_lines(self):  # TODO handle breaks between block and files
        """
        Undo line breaks, i.e. join words which are split by a line break.

        Only Strings found after a record separator (BIGSKIP element) are examined,
        record by record.  Two consecutive Strings are joined either because the
        first one is marked with SUBS_TYPE="HypPart1" (the whole word is then taken
        from SUBS_CONTENT of the second one) or, heuristically, because the first
        one is the last String of its line and ends with a hyphen while the second
        one is the first String of its line.  The first String is removed, the
        second one receives the joined word, and at the end the two parent lines
        are fused into one.
        """
        self.log.info("Joining lines")

        def join_elements(e1, e2, joined_word):
            '''
            Joins two String elements putting joined_word as the resulting content.
            Returns parents of these elements so we can fuse them later.  (Fusing them now would mess with iteration.)
            '''
            e2.set('CONTENT', joined_word)
            e1w, e1h = int(e1.get("WIDTH")), int(e1.get("HEIGHT"))
            e2w, e2h = int(e2.get("WIDTH")), int(e2.get("HEIGHT"))
            # NOTE(review): both dimensions are simply added up; confirm this rough box is what consumers expect
            e2.set("HEIGHT", str(e2h + e1h))
            e2.set("WIDTH", str(e2w + e1w))
            tl1, tl2 = e1.getparent(), e2.getparent()
            tl1.remove(e1)
            return tl1, tl2

        def test_and_join(s1, s2):
            """
            Test whether elements s1 and s2 (which can be in different TextBlock / File) can be joined.
            If yes, then returns joined word, join_type, otherwise None, None
            :param s1: lxml.etree._Element
            :param s2: lxml.etree._Element
            :return: (unicode, str)
            """

            def join_words(w1, w2):
                '''
                Join two strings if the first one ends with '-' which follows a letter
                or one of the special characters; returns None otherwise.
                '''
                special_chars = [u'›', u">", u")"]
                if w1 and w2 and len(w1) > 1 and w1[-1] == "-" and (w1[-2].isalpha() or w1[-2] in special_chars):
                    self.log.debug("Joining %s with %s" % (w1, w2))
                    return w1[:-1] + w2
                return None

            joined_word, join_type = None, None
            if s1.get("SUBS_TYPE") == "HypPart1":
                joined_word = s2.get("SUBS_CONTENT")  # s1, s2 can differ, s2 more reliable?
                join_type = "OCR"
            else:
                # test whether s1 is the last one on the line and s2 is the first one
                tl1 = s1.getparent()
                last = tl1[-1] if len(tl1) > 0 else None
                tl2 = s2.getparent()
                first = tl2[0] if len(tl2) > 0 else None
                if last is not None and first is not None and s1 is last and s2 is first:
                    joined_word = join_words(s1.get("CONTENT"), s2.get("CONTENT"))
                    join_type = "HEUR"
            return joined_word, join_type

        joins = []

        # if memory consumption becomes a problem, we will need to rewrite this into a while loop
        elements = list(enumerate(self.xml.iter()))

        # Generator expressions are used for filtering: they behave like the former
        # ifilter calls but do not need tuple-unpacking lambdas.
        separators = ((n, e) for n, e in elements if "BIGSKIP" in e.tag)

        # iterate over consecutive pairs of record separators and remember their position in the XML tree
        for (n1, tb1), (n2, tb2) in nwise(separators, fillvalue=(None, None)):
            # iterate over consecutive pairs of strings that are between these record separators
            # (n2 is None for the last separator, so the slice then runs to the end)
            strings = ((n, e) for n, e in elements[n1:n2] if "String" in e.tag)
            for (ns1, s1), (ns2, s2) in nwise(strings, fillvalue=(None, None)):
                if s1 is None or s2 is None:
                    continue
                joined_word, join_type = test_and_join(s1, s2)
                if not joined_word:
                    continue
                # The hyphen element has to be looked up BEFORE s1 is detached:
                # a removed element no longer has any siblings.
                hyp = s1.getnext()
                tl1, tl2 = join_elements(s1, s2, joined_word)
                if hyp is not None and "HYP" in hyp.tag:
                    tl1.remove(hyp)
                joins.append((tl1, tl2, join_type))

        counter = Counter()
        # in case there are two splits after each other we have to work from the back
        while joins:
            tl1, tl2, join_type = joins.pop()
            assert "TextLine" in tl1.tag and "TextLine" in tl2.tag, "Expected to join two lines! \n %s \n %s" % (
                tl1.attrib, tl2.attrib)
            tl1.extend(tl2)  # moves all children of the second line into the first one
            p = tl2.getparent()
            if p is not None:
                p.remove(tl2)
            counter[join_type] += 1
        self.log.debug("We have joined %d lines based on info from OCR." % counter['OCR'])
        self.log.info("We have joined %d lines based on heuristics." % counter['HEUR'])

    def fix_characters(self):
        """
        Correct OCR errors in the content of String elements.

        Steps, in this order:

        1. word and character substitutions from self.WORDS / self.CHARS (done by replace_strings),
        2. replacement of selected misread word endings,
        3. fixes of the phrases "occurrit in" and "hoc sensu",
        4. correction of Greek by fix_greek, using the plain-text file that lies
           next to each ALTO file (same path, extension .txt).

        :return: self.xml
        """
        self.log.info("Applying word/character substitutions")
        replace_strings(self.xml, self.WORDS, self.CHARS)

        # special replacements at the end of words
        def fix_ending(ending, replacement):
            for e in self.xml.xpath(u".//alto:String[contains(@CONTENT, '%s')]" % ending,
                                    namespaces=self.ns):
                c = e.get("CONTENT")
                s = c.strip(" ,.:!")  # the word itself, without surrounding punctuation
                o_s = s
                if s.endswith(ending):
                    s = s[:-len(ending)] + replacement
                e.set("CONTENT", c.replace(o_s, s))

        # An ordered sequence rather than a dict: the longer ending has to be tried
        # before its own suffix, otherwise the result depends on dict ordering.
        e_subs = (
            (u'ннн', u'нии'),
            (u'нн', u'ни'),
            (u'пꙗ', u'иꙗ'),
        )
        for ending, replacement in e_subs:
            self.log.debug("Replacing %s at wordends with %s" % (ending, replacement))
            fix_ending(ending, replacement)

        # fixing "occurrit iu" and "boc sensn"
        self.log.debug("Fixing 'occurrit iu' and 'boc sensn'")
        for e1, e2 in nwise(self.xml.xpath(".//alto:String", namespaces=self.ns), n=2,
                            fillvalue=et.Element("None", attrib={"CONTENT": ""})):
            # default to "" so that a String without CONTENT cannot break the `in` tests
            e1c = e1.get("CONTENT", "")
            e2c = e2.get("CONTENT", "")
            if "occurrit" in e1c:
                # NOTE(review): any following word containing 'm' is turned into "in" - confirm this is intended
                if u"iu" in e2c or u'm' in e2c:
                    e2.set("CONTENT", "in")
            if 'boc' in e1c:
                if 'sensn' in e2c:
                    e1.set("CONTENT", 'hoc')
                    e2.set("CONTENT", 'sensu')

        self.log.debug("Fixing greek")
        for file_xml in self.xml:
            xml_path = file_xml.get('filename')
            txt_path = os.path.splitext(xml_path)[0] + '.txt'
            if not os.path.exists(txt_path):
                self.log.warning("File %s does not exists. Cannot fix greek for %s" % (txt_path, xml_path))
                # skip only this file; the remaining files may still have their text available
                continue
            self.log.info("Fixing greek with %s" % txt_path)
            with codecs.open(txt_path, encoding='utf-8', mode='r') as f:
                txt = f.read()
            fix_greek(file_xml, txt)
        return self.xml

    def propagate_fontsize(self):
        """
        Inherit and save (normalized) fontsize information in string elements.

        For every file the TextStyle definitions are collected (ID -> FONTSIZE).
        Strings without their own STYLEREFS inherit the text style of their
        TextLine.  Finally every String receives a FONTSIZE attribute with one of
        the values "small", "normal" or "big"; Strings whose style cannot be
        resolved are treated as "normal".

        :return: self.xml
        """
        self.log.info("Propagating fontsize")

        def map_style_to_size(size):  # TODO tune these parameters
            if size < 8:
                return "small"
            elif size > 12:
                return "big"
            else:
                return "normal"

        def text_style(stylerefs, known):
            # STYLEREFS may list several ids separated by whitespace; pick the first
            # one that names a known TextStyle.  Whole ids are compared, so that
            # e.g. "TXT_1" is never mistaken for "TXT_10".
            for ref in (stylerefs or "").split():
                if ref in known:
                    return ref
            return None

        for file_xml in self.xml:
            # create a dict of available fontsizes
            fontsize = {}
            for ts in file_xml.findall('.//alto:TextStyle', self.ns):
                style_id, size = ts.get('ID'), ts.get('FONTSIZE')
                if style_id is None or size is None:
                    continue
                # FONTSIZE is a float in ALTO, int() would fail on values like "10.5"
                fontsize[style_id] = float(size)

            # propagate styles of lines to those Strings that have none of their own
            for tl in file_xml.xpath(".//alto:TextLine", namespaces=self.ns):
                line_style = text_style(tl.get("STYLEREFS"), fontsize)
                if line_style is None:
                    continue
                for string in tl.xpath('.//alto:String', namespaces=self.ns):
                    if string.get("STYLEREFS") is None:
                        string.set("STYLEREFS", line_style)

            for string in file_xml.xpath(".//alto:String", namespaces=self.ns):
                style = text_style(string.get("STYLEREFS"), fontsize)
                if style is None:
                    string.set("FONTSIZE", "normal")
                else:
                    string.set("FONTSIZE", map_style_to_size(fontsize[style]))
        return self.xml

    def insert_tabs(self):
        """
        Mark indented lines with special TAB elements.

        A TAB element is placed right before every TextLine whose HPOS exceeds the
        HPOS of its TextBlock by more than 60; it represents indentation in the
        original printed dictionary.

        :return: self.xml
        """
        self.log.info("Inserting TABs")
        # The decision is purely geometric: according to the original author the
        # paragraph style information from OCR (ParagraphStyle/FIRSTLINE) cannot be relied on.
        for file_xml in self.xml:
            blocks = file_xml.xpath(".//alto:TextBlock", namespaces=self.ns)
            for block in blocks:
                block_left = int(block.get("HPOS"))
                for line in block.xpath(".//alto:TextLine", namespaces=self.ns):
                    indent = int(line.get("HPOS")) - block_left
                    if indent <= 60:  # TODO tune this parameter
                        continue
                    line.addprevious(et.Element("TAB"))
        return self.xml

    def insert_bigskips(self):
        """
        Insert special BIGSKIP Elements that represent bigger interparagraph space in the original printed dictionary.

        A BIGSKIP is put before the first String of a TextLine when
        * the vertical gap to the preceding line is larger than 90, the line does
          not start more than 30 to the right of the preceding one and its first
          String is in the 'ru' alphabet, or
        * the line is the first child of a TextBlock and its first String has FONTSIZE "big".

        With DEBUG_INSERT_BIGSKIP enabled the decisions are also drawn into a copy
        of the page image, which is saved to the output directory.
        """
        self.log.info("Inserting BIGSKIPs")
        if DEBUG_INSERT_BIGSKIP:
            def put_line(draw, vpos, hpos, width, sx, sy, color='red'):
                # horizontal line in image coordinates; sx / sy scale the x / y axis
                path = list(map(int, (hpos / sx, vpos / sy, (hpos + width) / sx, vpos / sy)))
                # print(path)
                draw.line(path, fill=color)
                self.log.debug("Inserting line at: (%s, %s) with color %s" % (vpos, hpos, color))

            def get_scale_factors(xml, img):
                page = xml.xpath(".//alto:Page", namespaces=self.ns)[0]
                height = float(page.get("HEIGHT"))
                width = float(page.get("WIDTH"))
                # horizontal factor first, vertical second: the order in which the
                # caller unpacks them into (sx, sy)
                return width / img.size[0], height / img.size[1]

        def insert_bigskip(l1, l2):
            # Returns the first String of l2 if a record separator belongs before it, None otherwise.
            if len(l1) == 0 or len(l2) == 0:
                return None  # a line without children offers nothing to measure
            s1 = l1[0]
            s2 = l2[0]
            v1 = int(s1.get("VPOS"))
            h1 = int(s1.get("HEIGHT"))
            v2 = int(s2.get("VPOS"))
            if v2 - (v1 + h1) > 90:  # TODO tune this parameter
                hpos1 = int(s1.get("HPOS"))
                hpos2 = int(s2.get("HPOS"))
                if hpos2 - hpos1 <= 30:  # TODO tune this parameter
                    if s2.get("ALPHABET") == "ru":
                        if DEBUG_INSERT_BIGSKIP:
                            put_line(draw, v1 + h1, hpos1, int(s2.get("WIDTH")), sx, sy, color="green")
                            put_line(draw, v2, hpos2, int(s2.get("WIDTH")), sx, sy)
                        return s2
            return None

        insert_points = []
        for file_xml in self.xml:
            if DEBUG_INSERT_BIGSKIP:
                xml_fpath = file_xml.get("filename")
                xml_dir = os.path.dirname(xml_fpath)
                xml_file = os.path.basename(xml_fpath)
                img_file = os.path.join(xml_dir, xml_file.replace("EX", "N3")[:-4] + ".jpg")

                img = Image.open(img_file)
                sx, sy = get_scale_factors(file_xml, img)
                draw = ImageDraw.Draw(img)

            textlines = file_xml.xpath(".//alto:PrintSpace//alto:TextLine", namespaces=self.ns)
            for pair in zip(textlines, textlines[1:]):  # iterate over consecutive pairs
                insert_point = insert_bigskip(*pair)
                if insert_point is not None:
                    insert_points.append(insert_point)

            # check top elements of textblocks
            for tb in file_xml.xpath(".//alto:PrintSpace//alto:TextBlock", namespaces=self.ns):
                # Empty blocks and empty lines are skipped explicitly, so that a line
                # remembered from the previous block is never examined a second time.
                if len(tb) == 0:
                    continue
                first_line = tb[0]
                if len(first_line) == 0:
                    continue
                first_string = first_line[0]
                if first_string.get("FONTSIZE") != "big":
                    continue
                insert_points.append(first_string)
                if DEBUG_INSERT_BIGSKIP:
                    vpos = int(first_string.get("VPOS"))
                    hpos = int(first_string.get("HPOS"))
                    width = int(first_string.get("WIDTH"))
                    put_line(draw, vpos + 10, hpos, width * 3, sx, sy, "blue")

            if DEBUG_INSERT_BIGSKIP:
                img.save(os.path.join(self.output_dir, os.path.basename(img_file)))

        # the tree is modified only after all decisions have been made
        for s in insert_points:
            s.addprevious(et.Element("BIGSKIP"))
        self.log.debug("Inserted %d record separators" % len(insert_points))

    def run(self, files):
        """
        Run the whole preprocessing pipeline on the given files.

        The files are merged with join_xml, all preprocessing steps are applied in a
        fixed order and the result is written to the intermediate directory.

        :param files: the input files
        :return: single-item list holding the path of the written XML file
        """
        self.log.info("Preprocessing the following files:\n %s" % files)
        self.xml = join_xml(self.intermediate, files)

        pipeline = (
            self.propagate_fontsize,
            self.fill_alphabet,
            self.insert_bigskips,  # depends on correct alphabet
            self.join_lines,
            self.fix_characters,
            self.normalize_fontstyle,
            self.insert_tabs,
            self.fill_page_id,
        )
        for step in pipeline:
            step()

        output_path = join(self.intermediate, 'joined-files-preprocessed.xml')
        self.log.info("Writing output to %s" % output_path)
        savexml(self.xml, output_path)
        return [output_path]
