// Gorazd Generator
// Generator of dictionary entries from ALTO XML.
// Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

grammar SJSRecord;
// simplified version of SJSDictionary that parses only one record
// Entry rule ("slovnik" = dictionary): the input is a single record that is
// either a regular entry (heslo) or a cross-reference entry (odkazove_heslo).
// NOTE(review): the rule is not anchored with EOF — confirm that callers do
// not rely on the parser consuming the whole token stream.
slovnik: heslo
       | odkazove_heslo
       ;
// "Entry text": a non-greedy run of one or more words and the punctuation
// tokens ',', ';' and '-'.
text_hesla: (slovo | ',' | ';' | '-')+?;
// "Unrecognized": a non-greedy run of one or more tokens of any type. The
// other rules in this file only use it as an optional trailing fallback.
nerozpoznano: .+?;

// CROSS-REFERENCE ENTRY (odkazove heslo)
// An optional BIGSKIP token, the cross-reference header, then optional
// unrecognized trailing tokens.
odkazove_heslo: BIGSKIP? odkazova_hlavicka nerozpoznano?;
// Header of a cross-reference entry: the headword (zahlavi), the literal 'v.'
// (presumably Latin "vide" — TODO confirm), and zero or more reference targets,
// with optional free text allowed before 'v.', after it, and after the targets.
odkazova_hlavicka: zahlavi text_hesla? 'v.' text_hesla? odkaz* text_hesla? nerozpoznano?;
// One reference target: one or more ru_SLOVO words or RU_BIG tokens.
odkaz: (ru_SLOVO | RU_BIG)+;

// ENTRY (heslo)
// A regular entry, in order: optional BIGSKIP, the header (hlavicka), an
// optional occurrence note (vyskyt), one or more paragraphs (odstavec), an
// optional 'Exh.' marker (exh), zero or more confer references, optional free
// text, and optional unrecognized trailing tokens.
heslo: BIGSKIP? hlavicka vyskyt? odstavec+ exh? confer* text_hesla? nerozpoznano?;

// Entry header ("hlavicka"): a headword, optionally followed by additional
// headwords (second alternative), then an optional part-of-speech group and
// optional unrecognized tokens.
hlavicka: zahlavi slovni_druh? nerozpoznano?
        | zahlavi nasobne_zahlavi slovni_druh? nerozpoznano?
        ;

// Headword ("zahlavi"): a single RU_BIG or RU_NORMAL token followed by zero or
// more grammatical endings.
zahlavi: (RU_BIG | RU_NORMAL) gramaticke_zarazeni*;
// Multiple headwords ("nasobne zahlavi"). Either a run of headword tokens and
// commas followed by one or more grammatical endings, or a repeated sequence
// of headword token, comma and a single grammatical ending.
nasobne_zahlavi: ( (RU_BIG | RU_NORMAL) | ',')+ gramaticke_zarazeni+
               | ((RU_BIG | RU_NORMAL) ',' gramaticke_zarazeni)+
               ;
// Grammatical classification ("gramaticke zarazeni"): an optional comma, a
// dash, and one ru_SLOVO word or RU_BIG token.
gramaticke_zarazeni: ','? '-' (ru_SLOVO|RU_BIG);
// Part of speech ("slovni druh"): one SLOVNI_DRUH abbreviation followed by any
// mix of further abbreviations, commas and TAB tokens.
slovni_druh: SLOVNI_DRUH (SLOVNI_DRUH | ',' | TAB)*;

// Occurrence note ("vyskyt"): a TAB token, the literal 'occurrit in' or
// 'occurrit', then any mix of source names (pamatka) and free text.
vyskyt: TAB ('occurrit in' | 'occurrit') (pamatka | text_hesla)*;
// Source text name ("pamatka"): currently a single la_SLOVO word.
pamatka: la_SLOVO; // TODO multi-word source names and similar oddities + validation against the list --> in post-processing

// Earlier, more permissive form of the rule, kept for reference:
//odstavec: indikator_odstavce moderni_jazyky?  stare_jazyky?  (':'|';') text_odstavce? confer?;
// Paragraph ("odstavec"). Both alternatives start with a paragraph indicator
// and the modern-language translations. The first continues with ':'; the
// second continues with ';' and the old-language equivalents. Either may then
// have paragraph text, a confer reference and a closing ';'.
odstavec: indikator_odstavce moderni_jazyky ':' text_odstavce? confer? ';'?
        | indikator_odstavce moderni_jazyky ';' stare_jazyky text_odstavce? confer? ';'?
        ;
// Paragraph indicator: a TAB token, one of two bullet characters, or a dash,
// optionally followed by a note.
// NOTE(review): the rule matches '\u26ab', while the trailing comment names
// U+25CF — confirm which code point the input actually contains.
indikator_odstavce: (TAB | '\u2666' | '\u26ab' | '-' ) poznamka_indikator?;  // dot U+25CF = ●, diamond U+2666 = ♦
// Note attached to the indicator: a non-greedy run of RU_NORMAL / LA_NORMAL
// tokens with an optional ':'.
poznamka_indikator: ((RU_NORMAL | LA_NORMAL)+? ':'?);
// Paragraph text: one or more words, punctuation tokens or TAB tokens.
text_odstavce: (slovo | ',' | ':' | ';' | TAB | '-')+; // TAB can occur inside "Nota:", and some paragraphs contain no translations, so those are included in the text

// Modern-language translations ("moderni jazyky"): Czech, Russian and German
// groups, in that order, separated by ';'.
moderni_jazyky: cestina ';' rustina ';' nemcina;
// Czech ("cestina"): comma-separated list of la_preklad translations.
cestina: la_preklad (',' la_preklad)*;
// Russian ("rustina"): comma-separated list of ru_preklad translations.
rustina: ru_preklad (',' ru_preklad)*;
// German ("nemcina"): comma-separated list of la_preklad translations.
nemcina: la_preklad (',' la_preklad)*;

// Old-language equivalents ("stare jazyky").
// Alternative 1: one or two dashes, an optional note, Greek, ';', Latin, and
// optionally ';' plus Old German.
// Alternative 2: up to two optional dashes, Latin, and optionally Greek.
stare_jazyky: '-' '-'? poznamka_stare_jazyky? rectina ';' latina (';' staronemcina)?
            | '-'? '-'? latina rectina?// rule 3.f
            ;
// Note preceding the old-language equivalents: one or more RU_NORMAL /
// LA_NORMAL tokens with an optional ':'.
poznamka_stare_jazyky: (RU_NORMAL | LA_NORMAL)+ ':'? ; // TODO verify

// Greek ("rectina"): comma-separated list of el_preklad translations.
rectina: el_preklad (',' el_preklad)*;
// Latin ("latina"): any non-empty mix of the two kinds of Latin translation
// lists and commas.
latina: (latina_predloha | latina_slovnik | ',')+;
// Comma-separated list of non-italic Latin translations
// ("predloha" = source text — presumably the Latin original; TODO confirm).
latina_predloha: latina_predloha_preklad (',' latina_predloha_preklad)*;
// Comma-separated list of italic Latin translations
// ("slovnik" = dictionary — presumably a dictionary gloss; TODO confirm).
latina_slovnik: latina_slovnik_preklad (',' latina_slovnik_preklad)*;

// Old German ("staronemcina"): comma-separated list of la_preklad translations.
staronemcina: la_preklad (',' la_preklad)*;

// A single Latin translation is a run of LA_NORMAL tokens (predloha) or a run
// of LA_ITALIKA tokens (slovnik).
latina_predloha_preklad: LA_NORMAL+ ;
latina_slovnik_preklad: LA_ITALIKA+ ;

// Words in translations sometimes end with a dash, so the ru_/la_ translation
// rules accept '-' tokens alongside words. Note that all three rules accept
// any slovo, regardless of the language prefix in the rule name.
ru_preklad: (slovo | '-')+;
la_preklad: (slovo | '-')+;
el_preklad: slovo+;

// A word ("slovo") of any language group.
slovo: ru_SLOVO | el_SLOVO | la_SLOVO;

// Per-language word rules: each accepts both the non-italic and the italic
// token of its language group. Despite the upper-case suffix these are parser
// rules, because the names start with a lower-case letter.
ru_SLOVO: RU_NORMAL | RU_ITALIKA;
el_SLOVO: EL_NORMAL | EL_ITALIKA;
la_SLOVO: LA_NORMAL | LA_ITALIKA;

// 'Exh.' marker introduced by a dash or a TAB token.
exh: ('-' | TAB) 'Exh.' ;
// Confer reference: a marker followed by one or more reference targets and
// commas. The standalone form starts with TAB and 'Cf.' or 'CL'; the
// paragraph-level form starts with '-' and lower-case 'cf.'.
// NOTE(review): 'CL' looks like an OCR misreading of 'Cf.' — confirm.
confer: TAB ('Cf.' | 'CL' ) (confer_odkaz | ',')+ // standalone confer
      | '-' 'cf.' (confer_odkaz | ',')+ // paragraph-level confer
      ;
// One confer target: a word followed by any mix of words and dashes.
confer_odkaz: slovo (slovo | '-')*;

//                                               LEXER RULES

// _NORMAL is actually anything other than italics
// LA_ refers to all other alphabets than RU and EL
// Word tokens. Each word is one brace-delimited token that starts with a
// language field and a style field separated by a tab:
//     '{' LANG '\t' STYLE ... '}'
// Several of these rules can match the same text. Rule order matters: when
// matches have equal length, ANTLR chooses the rule defined first, so the more
// specific rules (big, italics) must precede the *_NORMAL catch-alls.

// Header word in the "big" font size. It is the only big string in a record,
// and OCR sometimes leaves the LANG attribute unfilled, so any two characters
// from [:a-z] are accepted as the language field.
RU_BIG: '{' [:a-z][:a-z] '\t' JINY_STYL '\t' 'big' .*? '}';
// Italic Russian-tagged word. The separator after the language code is a
// single tab, exactly as in every other word token; with a space in front of
// the tab this rule could never match and italic words fell through to
// RU_NORMAL.
RU_ITALIKA: '{ru' '\t' 'italics' ZBYTEK_SLOVA;
// Russian-tagged word in any other style.
RU_NORMAL: '{ru' '\t' JINY_STYL ZBYTEK_SLOVA;
// Italic Greek-tagged word.
EL_ITALIKA: '{el' '\t' 'italics' ZBYTEK_SLOVA;
// Greek-tagged word in any other style.
EL_NORMAL: '{el' '\t'  JINY_STYL ZBYTEK_SLOVA;
// Italic word in one of the remaining languages (see JINY_JAZYK).
LA_ITALIKA: '{' JINY_JAZYK '\t' 'italics' ZBYTEK_SLOVA;
// Word in one of the remaining languages, in any other style.
LA_NORMAL: '{' JINY_JAZYK '\t' JINY_STYL ZBYTEK_SLOVA;

// ANTLR doesn't allow negation of more than one token / character
// we  chose to list all possibilities in order to keep the resulting parse tree cleaner
// "Other language" ("jiny jazyk"): language codes other than ru / el, or
// nothing at all when the language field is empty.
fragment
JINY_JAZYK: 'cs' | 'de' | 'la' | ; // we must allow empty match
// "Other style" ("jiny styl"): one of the listed style names, or nothing at
// all when the style field is empty.
fragment
JINY_STYL: 'italics' | 'bold' | 'smallcaps' | 'subscript' | 'supscript' | ; // we must allow empty match

// "Rest of the word" ("zbytek slova"): non-greedily consumes everything up to
// and including the first closing brace.
fragment
ZBYTEK_SLOVA: .*? '}';
// Earlier field-by-field variant and the list of token fields, kept for reference:
//ZBYTEK_SLOVA: '|' FONTSIZE '|' CONTENT '|' POZICE '|' WC '|' IMG_ID '|' FONT '}';
//['LANG', 'STYLE', 'FONTSIZE', 'CONTENT', 'HPOS', 'VPOS', 'WIDTH', 'HEIGHT', 'WC', 'IMG_ID', 'FONT' ]

// Font-size names. Within the visible grammar this fragment is referenced only
// by the commented-out ZBYTEK_SLOVA variant above.
fragment
FONTSIZE: 'normal' | 'small' | 'big';
// Part-of-speech / grammatical abbreviation ("slovni druh"). Each alternative
// is a literal abbreviation including its trailing dot; the parser rule
// slovni_druh combines one or more of these tokens.
SLOVNI_DRUH: 'm.'
           | 'f.'
           | 'n.'
           | 'adj.'
           | 'poss.'
           | 'pron.'
           | 'num.'
           | 'ipf.'
           | 'pf.'
           | 'praep.'
           | 'conj.'
           | 'part.'
           | 'interj.'
           | 'ptc.'
// Disabled combined alternative, kept for reference:
//            | ('m'|'f') '\n.,\n' 'nom propr.'
           | 'nom.'
           | 'propr.'
           ;
// Layout markers: they appear in the input as the literal words 'TAB' and
// 'BIGSKIP', not as whitespace characters.
TAB: 'TAB';
BIGSKIP: 'BIGSKIP';

// Newlines between tokens are discarded. Only '\n' is skipped; no rule in the
// visible grammar matches a bare space or '\r' outside of a token.
WS: '\n' -> skip;

//ErrorCharacter: . ;