
// Gorazd Generator
// Generator of dictionary entries from ALTO XML.
// Copyright (C) 2018  Vít Tuček, Slovanský ústav AV ČR, v. v. i.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

grammar RSIRecord;

// Start rule (Czech "slovnik" = dictionary): one or more entries, each being
// either a full entry (heslo) or a cross-reference entry (odkazove_heslo).
// The explicit EOF forces the parser to account for the whole token stream.
// Without it the loop simply exits at the first token that cannot start an
// entry, and everything after that point is dropped without any error.
slovnik: (heslo | odkazove_heslo)+ EOF;
// Free text of an entry: a non-empty mix of words, part-of-speech markers
// and commas, in any order.
text_hesla
    : ( slovo
      | slovni_druh
      | ','
      )+
    ;

// A single word: a ru_slovo, an el_slovo or a LA_SLOVO token.
slovo
    : ru_slovo
    | el_slovo
    | LA_SLOVO
    ;

// Non-greedy catch-all for one or more arbitrary tokens ("nerozpoznano" =
// unrecognized). Not referenced by any active rule in this file; its only
// use, at the end of `heslo`, is commented out.
nerozpoznano
    : .+?
    ;

// CROSS-REFERENCE ENTRY (odkazove heslo)

// A record separator followed by a cross-reference header.
odkazove_heslo
    : RS odkazova_hlavicka
    ;

// Heading, optionally a comma plus additional headings, then the arrow token
// and the reference target.
odkazova_hlavicka
    : zahlavi ( ',' nasobne_zahlavi )? SIPKA odkaz
    ;

// Reference target: a non-empty run of EL_BOLD_BIG and EL_BIG tokens.
odkaz
    : ( EL_BOLD_BIG | EL_BIG )+
    ;

// ENTRY (heslo)

// A record separator, the entry header, then any number of paragraphs
// (odstavec) and sub-meanings (podvyznam) in any order.
heslo
    : RS hlavicka ( odstavec | podvyznam )*
    ;
// A trailing `nerozpoznano?` was tried on this rule and is currently disabled.

// Earlier draft of the header rule, kept for reference:
//hlavicka: zahlavi sloveso? slovni_druhy? text_hesla?
//        | zahlavi nasobne_zahlavi slovni_druhy? text_hesla?
//        ;

// Entry header (hlavicka). Both alternatives start with a heading (zahlavi):
//   1. heading, then any number of comma-prefixed additional headings or
//      verb forms, then optional header text;
//   2. heading, then any mix of comma-prefixed additional headings,
//      part-of-speech markers and header text.
// NOTE(review): the alternatives overlap -- a bare `zahlavi`, or a `zahlavi`
// followed only by `',' nasobne_zahlavi` groups, fits both. ANTLR resolves
// such ambiguities in favour of the lowest-numbered alternative, so the order
// of the two alternatives is significant.
hlavicka: zahlavi (',' nasobne_zahlavi | ',' sloveso)* hlavicka_text_hesla?
        | zahlavi (',' nasobne_zahlavi | slovni_druh | hlavicka_text_hesla)*;
// Another earlier draft, kept for reference:
//hlavicka: zahlavi ((',' nasobne_zahlavi | ',' sloveso)* | slovni_druhy)? hlavicka_text_hesla?;
// Text that may follow the headings in an entry header: a non-empty run of
// LA_SLOVO tokens, EL_NORMAL tokens and commas, in any order.
// Written as one flat loop. The previous nested form
// (LA_SLOVO+ | EL_NORMAL+ | ',')+ accepted exactly the same token sequences
// and produced the same flat list of terminal children, but a run of
// LA_SLOVO (or EL_NORMAL) tokens could be divided between the inner and the
// outer loop in many ways, which made the rule needlessly ambiguous.
hlavicka_text_hesla: (LA_SLOVO | EL_NORMAL | ',')+;

// Verb forms (sloveso): one or more EL_BIG tokens.
sloveso
    : EL_BIG+
    ;

// Main heading (zahlavi): PAR followed by one or more EL_BOLD_BIG tokens.
zahlavi
    : PAR EL_BOLD_BIG+
    ;

// Additional heading (nasobne zahlavi): one or more EL_BOLD_BIG tokens,
// without the leading PAR that `zahlavi` requires.
nasobne_zahlavi
    : EL_BOLD_BIG+
    ;

// Sub-meaning (podvyznam): a sub-heading followed either by optional header
// text, or by the arrow token and a reference target.
podvyznam
    : podzahlavi hlavicka_text_hesla?
    | podzahlavi SIPKA odkaz
    ;

// Sub-heading (podzahlavi): PAR, a non-empty mix of EL_BIG tokens and commas,
// then optional free text.
podzahlavi
    : PAR ( EL_BIG | ',' )+ text_hesla?
    ;

// Disabled earlier rule, kept for reference:
//slovni_druhy: slovni_druh (slovni_druh | ',' | hlavicka_text_hesla)*;

// Part-of-speech marker (slovni druh): one or more SLOVNI_DRUH tokens.
slovni_druh
    : SLOVNI_DRUH+
    ;

// Paragraph (odstavec): INDENT and PAR, an optional ssl_zahlavi heading,
// then mandatory free text.
odstavec
    : INDENT PAR ssl_zahlavi? text_hesla
    ;

// Paragraph heading: one RU_BOLD_BIG token followed by any mix of further
// RU_BOLD_BIG tokens and commas.
ssl_zahlavi
    : RU_BOLD_BIG ( RU_BOLD_BIG | ',' )*
    ;

// Any word token with the `$ru|` prefix.
ru_slovo
    : RU_OTHER
    | RU_BOLD_BIG
    ;

// Any word token with the `$el` prefix.
el_slovo
    : EL_BIG
    | EL_BOLD_BIG
    | EL_NORMAL
    | EL_OTHER
    ;

//                                               LEXER RULES

// Word tokens. Each one starts with '$', a language code and '|'-separated
// attribute fields, and runs up to the next '$' (see ZBYTEK_SLOVA).
// NOTE(review): the fields after the language code look like style and font
// size (cf. the field list noted next to ZBYTEK_SLOVA) -- confirm against the
// code that produces the input.
//
// _NORMAL actually means anything other than italics.
// LA_ covers every alphabet other than RU and EL.
//
// Order matters: text matching a specific rule (e.g. RU_BOLD_BIG) also
// matches the generic rule for the same language (RU_OTHER, EL_OTHER), so the
// specific rules are listed first and must stay ahead of the generic ones.
RU_BOLD_BIG: '$ru|bold|big|' ZBYTEK_SLOVA;
RU_OTHER: '$ru|' ZBYTEK_SLOVA;
EL_BOLD_BIG: '$el|bold|big|' ZBYTEK_SLOVA;
EL_BIG: '$el||big|' ZBYTEK_SLOVA;
EL_NORMAL: '$el||normal|' ZBYTEK_SLOVA;
EL_OTHER: '$el' '|'  ZBYTEK_SLOVA;
LA_SLOVO: '$' JINY_JAZYK '|' ZBYTEK_SLOVA;

// ANTLR does not allow negating more than one token / character, so the
// remaining language codes are listed explicitly; this also keeps the
// resulting parse tree cleaner.
fragment JINY_JAZYK
    : 'cs'
    | 'de'
    | 'la'
    |       // empty alternative: an empty match must be allowed
    ;

// Rest of a word token (zbytek slova): the shortest run of arbitrary
// characters up to and including the next '$'.
fragment ZBYTEK_SLOVA
    : .*? '$'
    ;
// Disabled, more detailed variant and the field list it was based on:
//ZBYTEK_SLOVA: '|' FONTSIZE '|' CONTENT '|' POZICE '|' WC '|' IMG_ID '|' FONT '}';
//['LANG', 'STYLE', 'FONTSIZE', 'CONTENT', 'IMG_ID', 'FONT' ]

// Part-of-speech abbreviations (slovni druh), matched as literal text.
// 'num nota' contains a space and is a single token.
SLOVNI_DRUH
    : 'm'
    | 'f'
    | 'n'
    | 'adj'
    | 'pron'
    | 'num nota'
    | 'num'
    | 'adv'
    | 'conj'
    | 'praep'
    | 'part'
    | 'interj'
    | 'propr'
    | 'indecl'
    ;

// Structural marker tokens, matched as literal keywords.
INDENT : 'INDENT' ;
PAR    : 'PAR' ;
RS     : 'RS' ;
// Arrow token (sipka): U+2192, i.e. '→'.
SIPKA  : '\u2192' ;

// Line breaks between tokens carry no meaning and are discarded.
// Carriage returns are skipped as well, so input with CRLF line endings no
// longer triggers a token-recognition error on every '\r'.
WS: [\r\n]+ -> skip;
//ErrorCharacter: . ;