From df92da631e9a10a099a72ba846f90adf99d180df Mon Sep 17 00:00:00 2001 From: Sébastien Dailly Date: Tue, 24 Aug 2021 15:04:25 +0200 Subject: Added pre-processing --- src/lib/dune | 5 ++++ src/lib/lexer.mll | 4 --- src/lib/parser.mly | 7 +++-- src/lib/process.ml | 2 +- src/lib/prononciation.mly | 72 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib/reader.ml | 53 ++++++++++++++++++++++++++-------- src/lib/sounds.ml | 23 +++++++++++++-- src/lib/tokens.mly | 9 ++++-- src/test/test.ml | 6 ++++ 9 files changed, 158 insertions(+), 23 deletions(-) create mode 100644 src/lib/prononciation.mly diff --git a/src/lib/dune b/src/lib/dune index 6693079..29b0668 100644 --- a/src/lib/dune +++ b/src/lib/dune @@ -9,6 +9,11 @@ (modules tokens) (flags --only-tokens) ) +(menhir + (modules tokens prononciation) + (merge_into prononciation) + (flags --external-tokens Tokens --table --explain --dump) ) + (menhir (modules tokens parser) (merge_into parser) diff --git a/src/lib/lexer.mll b/src/lib/lexer.mll index 236e353..f8a7f2c 100644 --- a/src/lib/lexer.mll +++ b/src/lib/lexer.mll @@ -17,7 +17,6 @@ rule letter = parse | '\xC3' '\xA9' { E_ACUTE } | 'f' { F } | 'g' { G } -| "gu" { G } | 'h' { H } | 'i' { I } | 'j' { J } @@ -29,13 +28,10 @@ rule letter = parse | "nn" { N } | 'o' { O } | 'p' { P } -| "ph" { F } | 'q' { Q } -| "qu" { K } | 'r' { R } | 'u' { U } | 's' { S } -| "ss" { SS } | 't' { T } | 'u' { U } | 'v' { V } diff --git a/src/lib/parser.mly b/src/lib/parser.mly index acb8e25..b4b7817 100644 --- a/src/lib/parser.mly +++ b/src/lib/parser.mly @@ -27,17 +27,17 @@ occlusiv: | T { T.t } | D { T.none } - | C { T.k } | K { T.k } | G { T.none } fricativ: | S { T.s () } - | SS { T.s () } + | SZ { T.sz () } + | Z { T.z () } | F { T.f } - | C H { T.ch () } + | X { T.ch () } obstruent: | occlusiv { $1 } @@ -68,6 +68,7 @@ voyels: | E_ACUTE E? { T.e `Closed , T.e `Closed } | E U { T.eu `Opened , T.eu `Opened } | O { T.o `Opened , T.o `Opened } + | A_NASAL { T.nasal (T.a `Opened), T.nasal (T.a `Opened) } nasal_voyels: | A N { T.a' () , T.a' () } diff --git a/src/lib/process.ml b/src/lib/process.ml index 9bfd45d..10b2945 100644 --- a/src/lib/process.ml +++ b/src/lib/process.ml @@ -56,7 +56,7 @@ module M(T:Sounds.T) = struct match c with | None -> init | Some op -> match op.opening, op.ending with - | hd::[], None when hd = T.s () -> + | hd::[], None when T.code hd = T.SZ -> let c = Some { op with opening = [T.z ()] } in (((v1, v2), c) , ending) | _ -> init diff --git a/src/lib/prononciation.mly b/src/lib/prononciation.mly new file mode 100644 index 0000000..f865abd --- /dev/null +++ b/src/lib/prononciation.mly @@ -0,0 +1,72 @@ +%{ + + (** This module transform the words into a semi-prononciation elements. + + The letters should be quite close with the associate phonem, but some + modification can occur. + + *) + + open Tokens +%} + +%start main +%% + +initial_voyel: + | A { A } + | E { E } + | I { I } + | O { O } + | U { U } + | E_ACUTE { E_ACUTE } + +voyel: + | initial_voyel { $1 } + + +letters: + | { [] } + | letters voyel { $2 :: $1 } + | letters Space { Space :: $1 } + | letters Sep { Sep :: $1 } + + | letters B { B :: $1 } + | letters C %prec Low { K :: $1 } + | letters C H { X :: $1 } + | letters C I { I :: S :: $1 } + | letters C E { E :: S :: $1 } + | letters C U I { I :: K :: $1 } + | letters C U E { E :: K :: $1 } + | letters D { D :: $1 } + | letters F { F :: $1 } + | letters G %prec Low { G :: $1 } + | letters G I { I :: J :: $1 } + | letters G E { E :: J :: $1 } + | letters G U I { I :: G :: $1 } + | letters G U E { E :: G :: $1 } + + | letters J { J :: $1 } + | letters K { K :: $1 } + | letters L { L :: $1 } + | letters M { M :: $1 } + | letters N { N :: $1 } + | letters P { P :: $1 } + | letters P H { F :: $1 } + + | letters Q U { K :: $1 } + + | letters R { R :: $1 } + | letters S S { S :: $1 } + | letters S { SZ :: $1 } + | letters T { T :: $1 } + + | letters V { V :: $1 } + | letters W { V :: $1 } + | letters X { S :: K :: $1 } + | letters Y { I :: $1 } + + | letters Z { Z :: $1 } + +main: + letters EOL { EOL::$1 } diff --git a/src/lib/reader.ml b/src/lib/reader.ml index 4b123d5..20dc9cc 100644 --- a/src/lib/reader.ml +++ b/src/lib/reader.ml @@ -15,23 +15,54 @@ let sound_to_string let succeed (res : Sounds.t list) = Ok (sound_to_string res) -let fail lexbuf (_ : 'a I.checkpoint) = - Error ( - Printf.sprintf - "At offset %d: syntax error." - (Lexing.lexeme_start lexbuf) - ) +let fail (_ : 'a I.checkpoint) = + Error ("Syntax Error") -(* The parser has suspended itself because of a syntax error. Stop. *) +let get_element lexbuf checkpoint = + let token = Lexer.letter lexbuf in + let startp = lexbuf.lex_start_p + and endp = lexbuf.lex_curr_p in + I.offer checkpoint (token, startp, endp) -let loop lexbuf result = - let supplier = I.lexer_lexbuf_to_supplier Lexer.letter lexbuf in - I.loop_handle succeed (fail lexbuf) supplier result +let rec loop get_element (checkpoint : Sounds.t list I.checkpoint) = + match checkpoint with + | I.InputNeeded _env -> + (* The parser needs a token. Request one from the lexer, + and offer it to the parser, which will produce a new + checkpoint. Then, repeat. *) + let checkpoint = get_element checkpoint in + loop get_element checkpoint + | I.Shifting _ + | I.AboutToReduce _ -> + let checkpoint = I.resume checkpoint in + loop get_element checkpoint + | I.HandlingError _env -> + fail checkpoint + | I.Accepted v -> + (* The parser has succeeded and produced a semantic value. Print it. *) + succeed v + | I.Rejected -> + (* The parser rejects this input. This cannot happen, here, because + we stop as soon as the parser reports [HandlingError]. *) + assert false + +let build_processor ll = + let l = ref ll in + fun checkpoint -> + match !l with + | [] -> raise (Failure "Empty") + | hd::tl -> + l := tl; + I.offer checkpoint (hd, Lexing.dummy_pos, Lexing.dummy_pos) let process (line : string) = let lexbuf = Lexing.from_string line in + let content = + Prononciation.main Lexer.letter lexbuf + |> List.rev + |> build_processor in try - loop lexbuf (Parser.Incremental.main lexbuf.lex_curr_p) + loop content (Parser.Incremental.main lexbuf.lex_curr_p) with | Lexer.Error msg -> Error msg diff --git a/src/lib/sounds.ml b/src/lib/sounds.ml index ec1ddf2..0ee9f5c 100644 --- a/src/lib/sounds.ml +++ b/src/lib/sounds.ml @@ -22,6 +22,7 @@ module type T = sig val k: t val f: t val s: unit -> t + val sz: unit -> t val ch: unit -> t val z: unit -> t @@ -33,6 +34,12 @@ module type T = sig val is_voyel : t -> bool val is_nasal : t -> bool + type code = + | None + | SZ + + val code : t -> code + end module T = struct @@ -40,12 +47,18 @@ module T = struct | None | Voyel + type code = + | None + | SZ + type t = - { repr : string + { code : code + ; repr : string ; muted : bool ; kind : kind ; nasal : bool } + end module Repr = struct @@ -68,7 +81,10 @@ module S = struct { repr = "." ; muted = false ; kind = None - ; nasal = false } + ; nasal = false + ; code = None } + + let code t = t.code let nasal t = match t.repr with @@ -120,6 +136,9 @@ module S = struct let s () = { none with repr = "s" } + let sz () = + { (s()) with code = SZ } + let ch () = { none with repr = "S" } diff --git a/src/lib/tokens.mly b/src/lib/tokens.mly index 428c744..5346005 100644 --- a/src/lib/tokens.mly +++ b/src/lib/tokens.mly @@ -1,5 +1,10 @@ %token Sep +%token A_NASAL +%token O_NASAL +%token I_NASAL + + %token A %token B %token C @@ -20,7 +25,7 @@ %token P %token R %token S -%token SS +%token SZ %token T %token U %token V @@ -37,7 +42,7 @@ %right C H J Q V W X Y Z %right P B T D K G -%right S SS F +%right S F %right N M %right L R diff --git a/src/test/test.ml b/src/test/test.ml index ca74a9e..5d1d09c 100644 --- a/src/test/test.ml +++ b/src/test/test.ml @@ -17,6 +17,11 @@ let process (optional_line : string option) expected = | Ok response -> if String.equal response expected then () + (* + (Printf.fprintf stdout "%s : %s OK \n%!" + line + expected) + *) else ( error := 1; (Printf.fprintf stdout "%s : got %s / %s expected\n%!" @@ -41,6 +46,7 @@ let tests = ; "abaissées", "abEse(s)" ; "abaissera", "abEs°Ra" ; "achat", "aSa(t)" + ; "as", "a(s)" ; "astiqué", "astike" ; "casait", "kazE(t)" ; "cassait", "kasE(t)" -- cgit v1.2.3