Added pre-processing

author: Sébastien Dailly <sebastien@chimrod.com> 2021-08-24 15:04:25 +0200
committer: Sébastien Dailly <sebastien@chimrod.com> 2021-08-24 15:04:25 +0200
commit: df92da631e9a10a099a72ba846f90adf99d180df (patch)
tree: c56cffa7045795c0cc494512c28fc34a63f800bc /src/lib
parent: 546afdcf2148087f3a90b69c23ea756550f64433 (diff)
8 files changed, 152 insertions, 23 deletions
diff --git a/src/lib/dune b/src/lib/dune
index 6693079..29b0668 100644
--- a/src/lib/dune
+++ b/src/lib/dune
@@ -10,6 +10,11 @@
   (flags --only-tokens) )
 
 (menhir   
+  (modules tokens prononciation)
+  (merge_into prononciation)
+  (flags --external-tokens Tokens --table --explain --dump) )
+
+(menhir   
   (modules tokens parser)
   (merge_into parser)
   (flags --external-tokens Tokens --table --explain --dump) )
diff --git a/src/lib/lexer.mll b/src/lib/lexer.mll
index 236e353..f8a7f2c 100644
--- a/src/lib/lexer.mll
+++ b/src/lib/lexer.mll
@@ -17,7 +17,6 @@ rule letter = parse
 | '\xC3' '\xA9' { E_ACUTE }
 | 'f'           { F }
 | 'g'           { G }
-| "gu"          { G }
 | 'h'           { H }
 | 'i'           { I }
 | 'j'           { J }
@@ -29,13 +28,10 @@ rule letter = parse
 | "nn"          { N }
 | 'o'           { O }
 | 'p'           { P }
-| "ph"          { F }
 | 'q'           { Q }
-| "qu"          { K }
 | 'r'           { R }
 | 'u'           { U }
 | 's'           { S }
-| "ss"          { SS }
 | 't'           { T }
 | 'u'           { U }
 | 'v'           { V }
diff --git a/src/lib/parser.mly b/src/lib/parser.mly
index acb8e25..b4b7817 100644
--- a/src/lib/parser.mly
+++ b/src/lib/parser.mly
@@ -27,17 +27,17 @@ occlusiv:
   | T               { T.t }
   | D               { T.none }
 
-  | C               { T.k }
   | K               { T.k }
   | G               { T.none }
 
 fricativ:
   | S               { T.s () }
-  | SS              { T.s () }
+  | SZ              { T.sz () }
+  | Z               { T.z () }
 
   | F               { T.f }
 
-  | C H             { T.ch () }
+  | X               { T.ch () }
 
 obstruent:
   | occlusiv        { $1 }
@@ -68,6 +68,7 @@ voyels:
   | E_ACUTE E?      { T.e  `Closed , T.e  `Closed }
   | E U             { T.eu `Opened , T.eu `Opened }
   | O               { T.o  `Opened , T.o  `Opened }
+  | A_NASAL         { T.nasal (T.a  `Opened), T.nasal (T.a  `Opened) }
 
 nasal_voyels:
   | A N             { T.a' ()      , T.a' ()      }
diff --git a/src/lib/process.ml b/src/lib/process.ml
index 9bfd45d..10b2945 100644
--- a/src/lib/process.ml
+++ b/src/lib/process.ml
@@ -56,7 +56,7 @@ module M(T:Sounds.T) = struct
       match c with
       | None -> init
       | Some op -> match op.opening, op.ending with
-        | hd::[], None when hd = T.s () ->
+        | hd::[], None when T.code hd = T.SZ ->
           let c = Some { op with opening = [T.z ()] } in
           (((v1, v2), c) , ending)
         | _ -> init
diff --git a/src/lib/prononciation.mly b/src/lib/prononciation.mly
new file mode 100644
index 0000000..f865abd
--- /dev/null
+++ b/src/lib/prononciation.mly
@@ -0,0 +1,72 @@
+%{
+
+    (** This module transform the words into a semi-prononciation elements. 
+
+    The letters should be quite close with the associate phonem, but some
+    modification can occur.
+
+     *)
+
+    open Tokens
+%}
+
+%start<Tokens.token list> main
+%%
+
+initial_voyel:
+  | A { A }
+  | E { E }
+  | I { I }
+  | O { O }
+  | U { U }
+  | E_ACUTE { E_ACUTE }
+
+voyel: 
+  | initial_voyel { $1 }
+  
+
+letters:
+  |                 { [] }
+  | letters voyel  { $2 :: $1 }
+  | letters Space   { Space :: $1 }
+  | letters Sep     { Sep :: $1 }
+
+  | letters B       { B :: $1 }
+  | letters C  %prec Low     { K :: $1 }
+  | letters C H     { X :: $1 }
+  | letters C I     { I :: S :: $1 }
+  | letters C E     { E :: S :: $1 }
+  | letters C U I   { I :: K :: $1 }
+  | letters C U E   { E :: K :: $1 }
+  | letters D       { D :: $1 }
+  | letters F       { F :: $1 }
+  | letters G %prec Low      { G :: $1 }
+  | letters G I     { I :: J :: $1 }
+  | letters G E     { E :: J :: $1 }
+  | letters G U I   { I :: G :: $1 }
+  | letters G U E   { E :: G :: $1 }
+
+  | letters J       { J :: $1 }
+  | letters K       { K :: $1 }
+  | letters L       { L :: $1 }
+  | letters M       { M :: $1 }
+  | letters N       { N :: $1 }
+  | letters P       { P :: $1 }
+  | letters P H     { F :: $1 }
+
+  | letters Q U     { K :: $1 }
+
+  | letters R       { R :: $1 }
+  | letters S S     { S :: $1 }
+  | letters S       { SZ :: $1 }
+  | letters T       { T :: $1 }
+
+  | letters V       { V :: $1 }
+  | letters W       { V :: $1 }
+  | letters X       { S :: K :: $1 }
+  | letters Y       { I :: $1 }
+
+  | letters Z       { Z :: $1 }
+
+main: 
+    letters EOL { EOL::$1 }
diff --git a/src/lib/reader.ml b/src/lib/reader.ml
index 4b123d5..20dc9cc 100644
--- a/src/lib/reader.ml
+++ b/src/lib/reader.ml
@@ -15,23 +15,54 @@ let sound_to_string
 let succeed (res : Sounds.t list) =
   Ok (sound_to_string res)
 
-let fail lexbuf (_ : 'a I.checkpoint) =
-  Error (
-    Printf.sprintf
-      "At offset %d: syntax error."
-      (Lexing.lexeme_start lexbuf)
-  )
+let fail (_ : 'a I.checkpoint) =
+  Error ("Syntax Error")
 
-(* The parser has suspended itself because of a syntax error. Stop. *)
+let get_element lexbuf checkpoint =
+  let token = Lexer.letter lexbuf in
+  let startp = lexbuf.lex_start_p
+  and endp = lexbuf.lex_curr_p in
+  I.offer checkpoint (token, startp, endp)
 
-let loop lexbuf result =
-  let supplier = I.lexer_lexbuf_to_supplier Lexer.letter lexbuf in
-  I.loop_handle succeed (fail lexbuf) supplier result
+let rec loop get_element (checkpoint : Sounds.t list I.checkpoint) =
+  match checkpoint with
+  | I.InputNeeded _env ->
+    (* The parser needs a token. Request one from the lexer,
+       and offer it to the parser, which will produce a new
+       checkpoint. Then, repeat. *)
+    let checkpoint = get_element checkpoint in
+    loop get_element checkpoint
+  | I.Shifting _
+  | I.AboutToReduce _ ->
+    let checkpoint = I.resume checkpoint in
+    loop get_element checkpoint
+  | I.HandlingError _env ->
+    fail checkpoint
+  | I.Accepted v ->
+    (* The parser has succeeded and produced a semantic value. Print it. *)
+    succeed v
+  | I.Rejected ->
+    (* The parser rejects this input. This cannot happen, here, because
+       we stop as soon as the parser reports [HandlingError]. *)
+    assert false
+
+let build_processor ll =
+  let l = ref ll in
+  fun checkpoint ->
+    match !l with
+    | [] -> raise (Failure "Empty")
+    | hd::tl ->
+      l := tl;
+      I.offer checkpoint (hd, Lexing.dummy_pos, Lexing.dummy_pos)
 
 let process (line : string) =
   let lexbuf = Lexing.from_string line in
+  let content =
+    Prononciation.main Lexer.letter lexbuf
+    |> List.rev
+    |> build_processor in
   try
-    loop lexbuf (Parser.Incremental.main lexbuf.lex_curr_p)
+    loop content (Parser.Incremental.main lexbuf.lex_curr_p)
   with
   | Lexer.Error msg -> Error msg
 
diff --git a/src/lib/sounds.ml b/src/lib/sounds.ml
index ec1ddf2..0ee9f5c 100644
--- a/src/lib/sounds.ml
+++ b/src/lib/sounds.ml
@@ -22,6 +22,7 @@ module type T = sig
   val k: t
   val f: t
   val s: unit -> t
+  val sz: unit -> t
   val ch: unit -> t
   val z: unit -> t
 
@@ -33,6 +34,12 @@ module type T = sig
   val is_voyel : t -> bool
   val is_nasal : t -> bool
 
+  type code =
+    | None
+    | SZ
+
+  val code : t -> code
+
 end
 
 module T = struct
@@ -40,12 +47,18 @@ module T = struct
     | None
     | Voyel
 
+  type code =
+    | None
+    | SZ
+
   type t =
-    { repr : string
+    { code : code
+    ; repr : string
     ; muted : bool
     ; kind : kind
     ; nasal : bool
     }
+
 end
 
 module Repr = struct
@@ -68,7 +81,10 @@ module S = struct
     { repr = "."
     ; muted = false
     ; kind = None
-    ; nasal = false }
+    ; nasal = false
+    ; code = None }
+
+  let code t = t.code
 
   let nasal t =
     match t.repr with
@@ -120,6 +136,9 @@ module S = struct
   let s () =
     { none with repr = "s" }
 
+  let sz () =
+    { (s()) with code = SZ }
+
   let ch () =
     { none with repr = "S" }
 
diff --git a/src/lib/tokens.mly b/src/lib/tokens.mly
index 428c744..5346005 100644
--- a/src/lib/tokens.mly
+++ b/src/lib/tokens.mly
@@ -1,5 +1,10 @@
 %token Sep
 
+%token A_NASAL
+%token O_NASAL
+%token I_NASAL
+
+
 %token A
 %token B
 %token C
@@ -20,7 +25,7 @@
 %token P
 %token R
 %token S
-%token SS
+%token SZ
 %token T
 %token U
 %token V
@@ -37,7 +42,7 @@
 
 %right C H J Q V W X Y Z
 %right P B T D K G
-%right S SS F
+%right S F
 
 %right N M
 %right L R
author	Sébastien Dailly <sebastien@chimrod.com>	2021-08-24 15:04:25 +0200
committer	Sébastien Dailly <sebastien@chimrod.com>	2021-08-24 15:04:25 +0200
commit	df92da631e9a10a099a72ba846f90adf99d180df (patch)
tree	c56cffa7045795c0cc494512c28fc34a63f800bc /src/lib
parent	546afdcf2148087f3a90b69c23ea756550f64433 (diff)