// Gorazd Generator
// Generator of dictionary entries from ALTO XML.
// Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

grammar SNSPRecord;

// Top-level rule (Czech "slovnik" = dictionary): one or more entries, each
// either a full entry (heslo) or a cross-reference entry (odkazove_heslo).
// NOTE(review): the rule does not end with EOF, so nothing in the grammar
// forces the whole input to be consumed -- confirm the caller checks for
// leftover tokens.
slovnik
    : ( heslo
      | odkazove_heslo
      )+
    ;
// Entry text (Czech "text hesla"): a non-empty run of words, part-of-speech
// groups, commas and FREKVENCE tokens in any order.
// Note that the FREKVENCE token is matched directly here, not through the
// `frekvence` parser rule.
text_hesla
    : ( slovo
      | slovni_druh
      | ','
      | FREKVENCE
      )+
    ;
// A single word (Czech "slovo"): an RU word, an EL word, or an LA_SLOVO token.
slovo
    : ru_slovo
    | el_slovo
    | LA_SLOVO
    ;
// Catch-all (Czech "nerozpoznano" = unrecognized): a non-greedy run of one or
// more arbitrary tokens. No rule visible in this grammar references it; its
// only mention is in a commented-out tail of `heslo`.
nerozpoznano
    : .+?
    ;

// ODKAZOVE HESLO (cross-reference entry)
// Cross-reference entry: an RS marker, a PAR marker, then the
// cross-reference header.
odkazove_heslo
    : RS PAR odkazova_hlavicka
    ;
// Header of a cross-reference entry:
//   1. the headword, optionally followed by comma-separated further headwords,
//   2. optional entry text,
//   3. the SIPKA token followed by one or more comma-separated targets,
//   4. optional trailing entry text.
odkazova_hlavicka
    : odkazove_zahlavi (',' nasobne_odkazove_zahlavi)*
      text_hesla?
      SIPKA odkaz (',' odkaz)*
      text_hesla?
    ;
// Cross-reference target (Czech "odkaz" = reference): one or more RU_OTHER
// tokens.
odkaz
    : RU_OTHER+
    ;
// First headword of a cross-reference entry: a non-empty mix of RU_BOLD and
// RU_OTHER tokens.
odkazove_zahlavi
    : ( RU_BOLD
      | RU_OTHER
      )+
    ;
// Additional headword of a cross-reference entry (the ones that follow a
// comma in `odkazova_hlavicka`): a non-empty mix of RU_BOLD and RU_OTHER
// tokens, i.e. the same shape as `odkazove_zahlavi`.
nasobne_odkazove_zahlavi
    : ( RU_BOLD
      | RU_OTHER
      )+
    ;

// HESLO (dictionary entry)
// Full dictionary entry (Czech "heslo"): an RS marker, the entry header and
// zero or more paragraphs.
heslo
    : RS hlavicka odstavec*
    ;
// (disabled tail of the rule above) nerozpoznano?;

//hlavicka: zahlavi sloveso? slovni_druhy? text_hesla?
//        | zahlavi nasobne_zahlavi slovni_druhy? text_hesla?
//        ;

// Entry header (Czech "hlavicka"): the headword followed by any number of, in
// any order:
//   - a comma and a further headword,
//   - a grammatical classification, optionally preceded by a comma,
//   - a part-of-speech group, optionally preceded by a comma,
//   - header text,
//   - a frequency marker.
hlavicka
    : zahlavi
      ( ',' nasobne_zahlavi
      | ','? gramaticke_zarazeni
      | ','? slovni_druh
      | hlavicka_text_hesla
      | frekvence
      )*
    ;
//        | zahlavi (',' nasobne_zahlavi | slovni_druh | hlavicka_text_hesla)*;
//hlavicka: zahlavi ((',' nasobne_zahlavi | ',' sloveso)* | slovni_druhy)? hlavicka_text_hesla?;
// Free text inside an entry header: a non-empty run of LA_SLOVO tokens and
// commas in any order. Written as a single flat loop; subrules create no
// parse-tree nodes, so the tokens remain direct children of this rule.
hlavicka_text_hesla
    : ( LA_SLOVO
      | ','
      )+
    ;

// Grammatical classification (Czech "gramaticke zarazeni"): a single RU_OTHER
// token or the literal INDECL token.
gramaticke_zarazeni
    : RU_OTHER
    | INDECL
    ;
// Paragraph (Czech "odstavec"): a non-empty sequence of entry-text runs and
// cross-reference lists (SIPKA followed by comma-separated targets).
odstavec
    : ( text_hesla
      | SIPKA odkaz (',' odkaz)*
      )+
    ;
// Headword of a full entry (Czech "zahlavi"): a PAR marker followed by one or
// more RU_BOLD_BIG tokens.
zahlavi
    : PAR RU_BOLD_BIG+
    ;
// Additional headword of a full entry (follows a comma in `hlavicka`): one or
// more RU_BOLD_BIG tokens, without the leading PAR marker.
nasobne_zahlavi
    : RU_BOLD_BIG+
    ;

//slovni_druhy: slovni_druh (slovni_druh | ',' | hlavicka_text_hesla)*;
// Part-of-speech group (Czech "slovni druh"): one or more consecutive
// SLOVNI_DRUH tokens.
slovni_druh
    : SLOVNI_DRUH+
    ;

// Any RU word token, regardless of its style variant.
ru_slovo
    : RU_OTHER
    | RU_BOLD_BIG
    | RU_BOLD
    ;
// Parser-level wrapper around a single EL_SLOVO token.
el_slovo
    : EL_SLOVO
    ;
// Parser-level wrapper around a single FREKVENCE token (used by `hlavicka`).
frekvence
    : FREKVENCE
    ;

//                                               LEXER RULES
// The literal text `indecl.`; consumed by `gramaticke_zarazeni`.
INDECL
    : 'indecl.'
    ;
// Frequency marker: a `$la||normal|` word whose content starts with `(`, an
// optional U+227B character, one or more digits and `)`; the remainder up to
// the next `$` is consumed non-greedily.
// The same text would also satisfy LA_SLOVO, so this rule has to stay defined
// before LA_SLOVO in order to win the tie.
FREKVENCE
    : '$la||normal|('
      '\u227B'?
      [0-9]+
      ')'
      .*? '$'
    ;

// _NORMAL is actually anything other than italics
// LA_ refers to all other alphabets than RU and EL
// Word tokens. Each one is a `$`-delimited record: a `$<lang>|` prefix
// followed by everything up to the next `$` (see ZBYTEK_SLOVA).
//
// The order of the three RU rules is significant: a `$ru|bold|big|...$`
// record also satisfies RU_BOLD and RU_OTHER, and when several lexer rules
// match the same text the one defined first wins. Keep the most specific
// rule first.
RU_BOLD_BIG
    : '$ru|' 'bold|' 'big|' ZBYTEK_SLOVA
    ;
RU_BOLD
    : '$ru|' 'bold|' ZBYTEK_SLOVA
    ;
RU_OTHER
    : '$ru|' ZBYTEK_SLOVA
    ;
EL_SLOVO
    : '$el|' ZBYTEK_SLOVA
    ;
// Language code is one of JINY_JAZYK's alternatives and may be empty (`$|...$`).
LA_SLOVO
    : '$' JINY_JAZYK '|' ZBYTEK_SLOVA
    ;

// ANTLR doesn't allow negation of more than one token / character.
// We chose to list all possibilities in order to keep the resulting parse tree cleaner.
// Language code of an LA_SLOVO record (Czech "jiny jazyk" = other language):
// `cs`, `de`, `la`, or nothing at all -- the empty match must be allowed.
fragment
JINY_JAZYK
    : ( 'cs' | 'de' | 'la' )?
    ;

// Remainder of a word record (Czech "zbytek slova" = rest of the word): any
// characters, matched non-greedily, up to and including the next `$`.
fragment
ZBYTEK_SLOVA
    : .*? '$'
    ;
//ZBYTEK_SLOVA: '|' FONTSIZE '|' CONTENT '|' POZICE '|' WC '|' IMG_ID '|' FONT '}';
//['LANG', 'STYLE', 'FONTSIZE', 'CONTENT', 'IMG_ID', 'FONT' ]

// Part-of-speech token (Czech "slovni druh"): one of the literal
// abbreviations listed below. `num. nota` contains a space and shares a
// prefix with `num.`; the lexer prefers the longest match.
SLOVNI_DRUH
    : 'm.' | 'f.' | 'n.'
    | 'adj.' | 'poss.' | 'pron.'
    | 'num. nota' | 'num.'
    | 'ipf.' | 'pf.'
    | 'praep.' | 'conj.' | 'part.' | 'interj.' | 'ptc.'
//  | ('m'|'f') '\n.,\n' 'nom propr.'
    | 'nom.' | 'propr.'
    ;

// Structural marker tokens, each matching its own literal text.

// Not referenced by any parser rule visible in this grammar.
INDENT
    : 'INDENT'
    ;

// Precedes the headword in `zahlavi` and in `odkazove_heslo`.
PAR
    : 'PAR'
    ;

// First token of both `heslo` and `odkazove_heslo`.
RS
    : 'RS'
    ;

// Cross-reference marker (Czech "sipka" = arrow); matches the text `v.`.
// A previous definition used the arrow character: '→'.
SIPKA
    : 'v.'
    ;

// Line separators between tokens are discarded.
// Carriage returns are skipped together with line feeds so that input with
// CRLF line endings does not raise a token recognition error on every '\r'.
// Other whitespace (spaces, tabs) outside of tokens is intentionally still
// not skipped.
WS
    : [\r\n]+ -> skip
    ;
//ErrorCharacter: . ;