summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSébastien Dailly <sebastien@chimrod.com>2021-08-23 14:37:53 +0200
committerSébastien Dailly <sebastien@chimrod.com>2021-08-23 14:37:53 +0200
commit546afdcf2148087f3a90b69c23ea756550f64433 (patch)
treeac56c71393aacf0fade729e98eeecb1e87a88534
Initial commit
-rw-r--r--Makefile5
-rw-r--r--dune-project15
-rw-r--r--src/bin/dune6
-rw-r--r--src/bin/transcriptor.ml23
-rw-r--r--src/lib/.mly24
-rw-r--r--src/lib/dune17
-rw-r--r--src/lib/lexer.mll64
-rw-r--r--src/lib/parser.mly118
-rw-r--r--src/lib/process.ml115
-rw-r--r--src/lib/reader.ml37
-rw-r--r--src/lib/sounds.ml142
-rw-r--r--src/lib/tokens.mly46
-rw-r--r--src/test/dune9
-rw-r--r--src/test/test.ml66
14 files changed, 687 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0e365db
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,5 @@
+all:
+ dune build
+
+test:
+ dune runtest src/test
diff --git a/dune-project b/dune-project
new file mode 100644
index 0000000..fddf7a5
--- /dev/null
+++ b/dune-project
@@ -0,0 +1,15 @@
+(lang dune 2.9)
+(name transcriptor)
+(generate_opam_files true)
+
+(using menhir 2.0)
+
+
+(package
+ (name transcriptor)
+ (synopsis "Text transcriptor")
+ (depends
+ (menhirLib (>= 20210419))
+ )
+ )
+
diff --git a/src/bin/dune b/src/bin/dune
new file mode 100644
index 0000000..5d49b1d
--- /dev/null
+++ b/src/bin/dune
@@ -0,0 +1,6 @@
+(executables
+ (names
+ transcriptor
+ )
+ (libraries translator)
+ )
diff --git a/src/bin/transcriptor.ml b/src/bin/transcriptor.ml
new file mode 100644
index 0000000..2378c5d
--- /dev/null
+++ b/src/bin/transcriptor.ml
@@ -0,0 +1,23 @@
+module T = Translator
+module P = T.Parser
+
+module Parser = P.Make(T.Sounds)
+module I = Parser.MenhirInterpreter
+
+let process (optional_line : string option) =
+ match optional_line with
+ | None -> ()
+ | Some line ->
+ match T.Reader.process line with
+ | Ok response -> print_endline response
+ | Error err -> print_endline err
+
+let rec repeat channel =
+ (* Attempt to read one line. *)
+ let optional_line, continue = T.Lexer.line channel in
+ process optional_line;
+ if continue then
+ repeat channel
+
+let () =
+ repeat (Lexing.from_channel stdin)
diff --git a/src/lib/.mly b/src/lib/.mly
new file mode 100644
index 0000000..464a065
--- /dev/null
+++ b/src/lib/.mly
@@ -0,0 +1,24 @@
+%start<'a T.t List.t> main
+
+
+initial:
+ | { }
+
+medium:
+ | { }
+
+ending
+ | {}
+
+word:
+ | initial medium* ending
+
+
+main:
+ | {}
+ | main word Spaces+
+ {
+
+
+ }
+
diff --git a/src/lib/dune b/src/lib/dune
new file mode 100644
index 0000000..6693079
--- /dev/null
+++ b/src/lib/dune
@@ -0,0 +1,17 @@
+(library
+ (name
+ translator
+ )
+ (libraries menhirLib)
+ )
+
+(menhir
+ (modules tokens)
+ (flags --only-tokens) )
+
+(menhir
+ (modules tokens parser)
+ (merge_into parser)
+ (flags --external-tokens Tokens --table --explain --dump) )
+
+(ocamllex lexer)
diff --git a/src/lib/lexer.mll b/src/lib/lexer.mll
new file mode 100644
index 0000000..236e353
--- /dev/null
+++ b/src/lib/lexer.mll
@@ -0,0 +1,64 @@
+{
+
+ open Tokens
+
+ exception Error of string
+
+}
+
+rule letter = parse
+| '|' { Sep }
+| 'a' { A }
+| 'b' { B }
+| 'c' { C }
+| 'd' { D }
+| 'e' { E }
+| '\233' { E_ACUTE }
+| '\xC3' '\xA9' { E_ACUTE }
+| 'f' { F }
+| 'g' { G }
+| "gu" { G }
+| 'h' { H }
+| 'i' { I }
+| 'j' { J }
+| 'k' { K }
+| 'l' { L }
+| 'm' { M }
+| "mm" { M }
+| 'n' { N }
+| "nn" { N }
+| 'o' { O }
+| 'p' { P }
+| "ph" { F }
+| 'q' { Q }
+| "qu" { K }
+| 'r' { R }
+| 'u' { U }
+| 's' { S }
+| "ss" { SS }
+| 't' { T }
+| 'u' { U }
+| 'v' { V }
+| 'w' { W }
+| 'x' { X }
+| 'y' { Y }
+| 'z' { Z }
+| ' ' { Space }
+| '\n' { EOL }
+| eof { EOL }
+
+(* This rule looks for a single line, terminated with '\n' or eof.
+ It returns a pair of an optional string (the line that was found)
+ and a Boolean flag (false if eof was reached). *)
+
+and line = parse
+| ([^'\n']* '\n') as line
+ (* Normal case: one line, no eof. *)
+ { Some line, true }
+| eof
+ (* Normal case: no data, eof. *)
+ { None, false }
+| ([^'\n']+ as line) eof
+ (* Special case: some data but missing '\n', then eof.
+ Consider this as the last line, and add the missing '\n'. *)
+ { Some (line), false }
diff --git a/src/lib/parser.mly b/src/lib/parser.mly
new file mode 100644
index 0000000..acb8e25
--- /dev/null
+++ b/src/lib/parser.mly
@@ -0,0 +1,118 @@
+(*
+
+See [1] for the theory behind the analysis
+
+
+[1] https://fr.m.wiktionary.org/wiki/Annexe:Prononciation/fran%C3%A7ais#Structure_syllabique
+
+
+ *)
+%parameter<T:Sounds.T>
+
+
+
+%{
+
+ module P = Process.M(T)
+
+%}
+%start<T.t List.t> main
+
+%%
+
+occlusiv:
+ | P { T.p }
+ | B { T.b }
+
+ | T { T.t }
+ | D { T.none }
+
+ | C { T.k }
+ | K { T.k }
+ | G { T.none }
+
+fricativ:
+ | S { T.s () }
+ | SS { T.s () }
+
+ | F { T.f }
+
+ | C H { T.ch () }
+
+obstruent:
+ | occlusiv { $1 }
+ | fricativ { $1 }
+
+liquid:
+ | L { T.l () }
+ | R { T.r () }
+
+nasal:
+ | N { T.n () }
+
+opening_consonant:
+ | occlusiv { $1, None }
+ | fricativ { $1, None }
+ | nasal { $1, None }
+ | liquid { $1, None }
+ | obstruent liquid { $1, Some $2 }
+ | occlusiv fricativ { $1, Some $2 }
+
+(* Each voyel as two associated sounds, depending there is a followng sound or
+ not *)
+voyels:
+ | A { T.a `Opened , T.a `Opened }
+ | A I { T.e `Opened, T.e `Opened }
+ | I { T.i `Opened , T.i `Opened }
+ | E { T.schwa () , T.schwa () }
+ | E_ACUTE E? { T.e `Closed , T.e `Closed }
+ | E U { T.eu `Opened , T.eu `Opened }
+ | O { T.o `Opened , T.o `Opened }
+
+nasal_voyels:
+ | A N { T.a' () , T.a' () }
+ %prec Low
+
+ending_consonant:
+ | S { Some (T.s ()) }
+ | T { None }
+ | R { Some (T.r ()) }
+ | nasal { Some $1 }
+
+ending_word:
+ | X { Some (T.muted (T.s ())) }
+ | S { Some (T.muted (T.s ())) }
+ | R { Some (T.muted (T.r ())) }
+ | T { Some (T.muted T.t) }
+
+consonant_group:
+ | opening_consonant
+ {
+ { ending = None
+ ; opening = [ fst $1 ]
+ ; following = snd $1 }
+ }
+ | ending_consonant
+ opening_consonant
+ {
+ { ending = Some $1
+ ; opening = [ fst $2 ]
+ ; following = snd $2 }
+ }
+
+syllable:
+ | c = consonant_group?
+ v = voyels
+ { (v, c) }
+
+
+syllables:
+ | { [] }
+ | ss = syllables s = syllable { s::ss }
+
+
+word:
+ | syllables ending_word? EOL { P.rebuild $2 $1 }
+
+main:
+ | word { $1 }
diff --git a/src/lib/process.ml b/src/lib/process.ml
new file mode 100644
index 0000000..9bfd45d
--- /dev/null
+++ b/src/lib/process.ml
@@ -0,0 +1,115 @@
+open StdLabels
+module M(T:Sounds.T) = struct
+
+ type voyel = (T.t * T.t )
+
+ type consonants =
+ { ending : T.t option option
+ ; opening : T.t list
+ ; following : T.t option }
+
+ type group = voyel * consonants option
+
+ type modifier = (group * T.t option option) -> (group * T.t option option)
+
+ (** Apply all the modifiers to the syllabic group in order to correct the
+ relation between the elements
+
+ This is just a fold_left list, and the order matter : for example
+ nasalisation shall be applied after the S vocalisation
+
+ *)
+ let apply_modifiers
+ : group * T.t option option -> modifier list -> group * T.t option option
+ = fun e m ->
+ List.fold_left m
+ ~init:e
+ ~f:(fun e f -> f e)
+
+ (** The Nasal modifier transform a voyel followed by N and a consonant
+ into a nasal voyel.
+
+ Does this min that nasal voyel are not a distinct element, but just a
+ merge from two distinct elements ? *)
+ let nasal
+ : modifier
+ = fun init ->
+ let (((v1, v2), c) , ending) = init in
+ let ending = Option.bind ending (fun x -> x) in
+ match ending with
+ | None -> init
+ | Some ending ->
+ match T.is_nasal ending with
+ | false -> init
+ | true ->
+ (* Remove the ending consonant, and transform the voyel into
+ the nasal form *)
+ ( ( (T.nasal v1, T.nasal v2)
+ , c )
+ , None )
+
+ let vocalize_s
+ : modifier
+ = fun init ->
+ let (((v1, v2), c) , ending) = init in
+
+ match c with
+ | None -> init
+ | Some op -> match op.opening, op.ending with
+ | hd::[], None when hd = T.s () ->
+ let c = Some { op with opening = [T.z ()] } in
+ (((v1, v2), c) , ending)
+ | _ -> init
+
+ let rec _rebuild ~(m:modifier list) acc ending_consonant : group list -> T.t list
+ = function
+ | [] -> acc
+ | hd::tl ->
+
+ let modifier_ = vocalize_s :: nasal :: m in
+ let (voyel, consonants), ending_consonant =
+ apply_modifiers
+ (hd, ending_consonant)
+ modifier_ in
+
+ (* Add the last consonant and the voyel *)
+ let m, acc = match ending_consonant with
+ | None -> modifier_, (snd voyel)::acc
+ | Some s ->
+ let default = modifier_, (fst voyel) :: acc in
+ match s with
+ | None -> default
+ | Some s ->
+
+ modifier_, (fst voyel) :: s::acc in
+
+ match consonants with
+ | None -> _rebuild ~m acc None tl
+ | Some {ending; opening; following} ->
+
+ let acc = match following with
+ | None -> acc
+ | Some s -> s::acc in
+
+ match opening with
+ | [] ->_rebuild ~m acc ending tl
+ | opening -> _rebuild ~m (opening @ acc) ending tl
+
+ (** Rebuild the list in the normal order
+
+ The voyels have to be choosen, depending either they are followed by a
+ consonant or not
+
+ Some consonants may be changed depending of the following voyel
+
+ The list has to be reversed
+
+ and so one
+
+ *)
+ let rebuild
+ : T.t option option -> group list -> T.t list
+ = fun ending elems ->
+ _rebuild ~m:[] [] ending elems
+
+end
diff --git a/src/lib/reader.ml b/src/lib/reader.ml
new file mode 100644
index 0000000..4b123d5
--- /dev/null
+++ b/src/lib/reader.ml
@@ -0,0 +1,37 @@
+open StdLabels
+
+module P = Parser
+module Parser = P.Make(Sounds)
+module I = Parser.MenhirInterpreter
+
+let sound_to_string
+ : Sounds.t list -> string
+ = fun t ->
+ let buff = Buffer.create 16 in
+ List.iter t
+ ~f:(fun f -> Buffer.add_string buff f.Sounds.repr);
+ Buffer.contents buff
+
+let succeed (res : Sounds.t list) =
+ Ok (sound_to_string res)
+
+let fail lexbuf (_ : 'a I.checkpoint) =
+ Error (
+ Printf.sprintf
+ "At offset %d: syntax error."
+ (Lexing.lexeme_start lexbuf)
+ )
+
+(* The parser has suspended itself because of a syntax error. Stop. *)
+
+let loop lexbuf result =
+ let supplier = I.lexer_lexbuf_to_supplier Lexer.letter lexbuf in
+ I.loop_handle succeed (fail lexbuf) supplier result
+
+let process (line : string) =
+ let lexbuf = Lexing.from_string line in
+ try
+ loop lexbuf (Parser.Incremental.main lexbuf.lex_curr_p)
+ with
+ | Lexer.Error msg -> Error msg
+
diff --git a/src/lib/sounds.ml b/src/lib/sounds.ml
new file mode 100644
index 0000000..ec1ddf2
--- /dev/null
+++ b/src/lib/sounds.ml
@@ -0,0 +1,142 @@
+module type T = sig
+
+ type t
+ val muted : t -> t
+
+ val a : [`Closed | `Opened] -> t
+ val e : [`Closed | `Opened] -> t
+ val eu : [`Closed | `Opened] -> t
+
+ val o : [`Closed | `Opened] -> t
+ val schwa : unit -> t
+ val i : [`Closed | `Opened] -> t
+
+
+ val nasal: t -> t
+
+ val none: t
+
+ val p: t
+ val b: t
+ val t: t
+ val k: t
+ val f: t
+ val s: unit -> t
+ val ch: unit -> t
+ val z: unit -> t
+
+ val n: unit -> t
+
+ val r: unit -> t
+ val l: unit -> t
+
+ val is_voyel : t -> bool
+ val is_nasal : t -> bool
+
+end
+
+module T = struct
+ type kind =
+ | None
+ | Voyel
+
+ type t =
+ { repr : string
+ ; muted : bool
+ ; kind : kind
+ ; nasal : bool
+ }
+end
+
+module Repr = struct
+
+ let a = "a"
+ and a_nasal = "@"
+
+ and o_nasal = "§"
+
+end
+
+module S = struct
+
+ include T
+
+ let is_voyel t = t.kind = Voyel
+ let is_nasal t = t.nasal
+
+ let none =
+ { repr = "."
+ ; muted = false
+ ; kind = None
+ ; nasal = false }
+
+ let nasal t =
+ match t.repr with
+ | "a" -> { t with repr = Repr.a_nasal ; nasal = true }
+ | "o" -> { t with repr = Repr.o_nasal ; nasal = true }
+ | _ -> t
+
+ let muted f =
+ { none with
+ repr = "(" ^ f.repr ^ ")"
+ ; muted = true }
+
+ let a _ =
+ { none with repr = Repr.a }
+
+ let e = function
+ | `Closed -> { none with repr = "e" }
+ | `Opened -> { none with repr = "E" }
+
+ let eu = function
+ | `Closed -> { none with repr = "2" }
+ | `Opened -> { none with repr = "9" }
+
+
+ let schwa () =
+ { none with repr = "°" }
+
+ let o _ =
+ { none with repr = "o" }
+
+ let i _ =
+ { none with repr = "i" }
+
+ let p =
+ { none with repr = "p" }
+
+ let b =
+ { none with repr = "b" }
+
+ let t =
+ { none with repr = "t" }
+
+ let k =
+ { none with repr = "k" }
+
+ let f =
+ { none with repr = "f" }
+
+ let s () =
+ { none with repr = "s" }
+
+ let ch () =
+ { none with repr = "S" }
+
+ let z () =
+ { none with repr = "z" }
+
+ let n () =
+ { none with
+ repr = "n"
+ ; nasal = true }
+
+ let l () =
+ { none with repr = "L" }
+
+ let r () =
+ { none with repr = "R" }
+
+end
+
+include S
diff --git a/src/lib/tokens.mly b/src/lib/tokens.mly
new file mode 100644
index 0000000..428c744
--- /dev/null
+++ b/src/lib/tokens.mly
@@ -0,0 +1,46 @@
+%token Sep
+
+%token A
+%token B
+%token C
+%token D
+%token E
+%token E_ACUTE
+%token F
+%token G
+%token H
+%token I
+%token J
+%token K
+%token L
+%token M
+%token N
+%token O
+%token Q
+%token P
+%token R
+%token S
+%token SS
+%token T
+%token U
+%token V
+%token W
+%token X
+%token Y
+%token Z
+%token Space
+%token EOL
+
+%nonassoc Low
+
+%right A E E_ACUTE I O U
+
+%right C H J Q V W X Y Z
+%right P B T D K G
+%right S SS F
+
+%right N M
+%right L R
+%right High
+
+%%
diff --git a/src/test/dune b/src/test/dune
new file mode 100644
index 0000000..f8dbc72
--- /dev/null
+++ b/src/test/dune
@@ -0,0 +1,9 @@
+(test
+ (name
+ test
+ )
+ (libraries translator)
+ )
+
+
+
diff --git a/src/test/test.ml b/src/test/test.ml
new file mode 100644
index 0000000..ca74a9e
--- /dev/null
+++ b/src/test/test.ml
@@ -0,0 +1,66 @@
+open StdLabels
+
+module T = Translator
+let error = ref 0
+
+let process (optional_line : string option) expected =
+ match optional_line with
+ | None -> ()
+ | Some line ->
+ match T.Reader.process line with
+ | Error result ->
+ error := 1;
+ Printf.fprintf stdout
+ "%s : %s\n%!"
+ line
+ result
+ | Ok response ->
+ if String.equal response expected then
+ ()
+ else (
+ error := 1;
+ (Printf.fprintf stdout "%s : got %s / %s expected\n%!"
+ line
+ response
+ expected)
+ )
+
+
+let rec repeat input expected =
+ (* Attempt to read one line. *)
+ let optional_line, continue = T.Lexer.line input in
+ process optional_line expected;
+ if continue then
+ repeat input expected
+
+let tests =
+ [ "abaca", "abaka"
+ ; "abaissa", "abEsa"
+ ; "abaissait", "abEsE(t)"
+ ; "abaissant", "abEs@(t)"
+ ; "abaissées", "abEse(s)"
+ ; "abaissera", "abEs°Ra"
+ ; "achat", "aSa(t)"
+ ; "astiqué", "astike"
+ ; "casait", "kazE(t)"
+ ; "cassait", "kasE(t)"
+ ; "chanci", "S@si"
+ ; "chat", "Sa(t)"
+ ; "chipant", "Sip@(t)"
+ ; "pacha", "paSa"
+ ; "péché", "peSe"
+ ; "persai", "pERse"
+ ; "asia", "azia"
+ ; "ani", "ani"
+ ; "anta", "@ta"
+ ; "plat", "pLa(t)"
+ ]
+
+let () =
+ let () = List.iter tests
+ ~f:(fun (input, expected) ->
+ repeat (Lexing.from_string input) expected)
+ in
+
+ exit !error
+