diff options
author | Sébastien Dailly <sebastien@chimrod.com> | 2021-08-23 14:37:53 +0200 |
---|---|---|
committer | Sébastien Dailly <sebastien@chimrod.com> | 2021-08-23 14:37:53 +0200 |
commit | 546afdcf2148087f3a90b69c23ea756550f64433 (patch) | |
tree | ac56c71393aacf0fade729e98eeecb1e87a88534 |
Initial commit
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | dune-project | 15 | ||||
-rw-r--r-- | src/bin/dune | 6 | ||||
-rw-r--r-- | src/bin/transcriptor.ml | 23 | ||||
-rw-r--r-- | src/lib/.mly | 24 | ||||
-rw-r--r-- | src/lib/dune | 17 | ||||
-rw-r--r-- | src/lib/lexer.mll | 64 | ||||
-rw-r--r-- | src/lib/parser.mly | 118 | ||||
-rw-r--r-- | src/lib/process.ml | 115 | ||||
-rw-r--r-- | src/lib/reader.ml | 37 | ||||
-rw-r--r-- | src/lib/sounds.ml | 142 | ||||
-rw-r--r-- | src/lib/tokens.mly | 46 | ||||
-rw-r--r-- | src/test/dune | 9 | ||||
-rw-r--r-- | src/test/test.ml | 66 |
14 files changed, 687 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0e365db --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +all: + dune build + +test: + dune runtest src/test diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..fddf7a5 --- /dev/null +++ b/dune-project @@ -0,0 +1,15 @@ +(lang dune 2.9) +(name transcriptor) +(generate_opam_files true) + +(using menhir 2.0) + + +(package + (name transcriptor) + (synopsis "Text transcriptor") + (depends + (menhirLib (>= 20210419)) + ) + ) + diff --git a/src/bin/dune b/src/bin/dune new file mode 100644 index 0000000..5d49b1d --- /dev/null +++ b/src/bin/dune @@ -0,0 +1,6 @@ +(executables + (names + transcriptor + ) + (libraries translator) + ) diff --git a/src/bin/transcriptor.ml b/src/bin/transcriptor.ml new file mode 100644 index 0000000..2378c5d --- /dev/null +++ b/src/bin/transcriptor.ml @@ -0,0 +1,23 @@ +module T = Translator +module P = T.Parser + +module Parser = P.Make(T.Sounds) +module I = Parser.MenhirInterpreter + +let process (optional_line : string option) = + match optional_line with + | None -> () + | Some line -> + match T.Reader.process line with + | Ok response -> print_endline response + | Error err -> print_endline err + +let rec repeat channel = + (* Attempt to read one line. *) + let optional_line, continue = T.Lexer.line channel in + process optional_line; + if continue then + repeat channel + +let () = + repeat (Lexing.from_channel stdin) diff --git a/src/lib/.mly b/src/lib/.mly new file mode 100644 index 0000000..464a065 --- /dev/null +++ b/src/lib/.mly @@ -0,0 +1,24 @@ +%start<'a T.t List.t> main + + +initial: + | { } + +medium: + | { } + +ending + | {} + +word: + | initial medium* ending + + +main: + | {} + | main word Spaces+ + { + + + } + diff --git a/src/lib/dune b/src/lib/dune new file mode 100644 index 0000000..6693079 --- /dev/null +++ b/src/lib/dune @@ -0,0 +1,17 @@ +(library + (name + translator + ) + (libraries menhirLib) + ) + +(menhir + (modules tokens) + (flags --only-tokens) ) + +(menhir + (modules tokens parser) + (merge_into parser) + (flags --external-tokens Tokens --table --explain --dump) ) + +(ocamllex lexer) diff --git a/src/lib/lexer.mll b/src/lib/lexer.mll new file mode 100644 index 0000000..236e353 --- /dev/null +++ b/src/lib/lexer.mll @@ -0,0 +1,64 @@ +{ + + open Tokens + + exception Error of string + +} + +rule letter = parse +| '|' { Sep } +| 'a' { A } +| 'b' { B } +| 'c' { C } +| 'd' { D } +| 'e' { E } +| '\233' { E_ACUTE } +| '\xC3' '\xA9' { E_ACUTE } +| 'f' { F } +| 'g' { G } +| "gu" { G } +| 'h' { H } +| 'i' { I } +| 'j' { J } +| 'k' { K } +| 'l' { L } +| 'm' { M } +| "mm" { M } +| 'n' { N } +| "nn" { N } +| 'o' { O } +| 'p' { P } +| "ph" { F } +| 'q' { Q } +| "qu" { K } +| 'r' { R } +| 'u' { U } +| 's' { S } +| "ss" { SS } +| 't' { T } +| 'u' { U } +| 'v' { V } +| 'w' { W } +| 'x' { X } +| 'y' { Y } +| 'z' { Z } +| ' ' { Space } +| '\n' { EOL } +| eof { EOL } + +(* This rule looks for a single line, terminated with '\n' or eof. + It returns a pair of an optional string (the line that was found) + and a Boolean flag (false if eof was reached). *) + +and line = parse +| ([^'\n']* '\n') as line + (* Normal case: one line, no eof. *) + { Some line, true } +| eof + (* Normal case: no data, eof. *) + { None, false } +| ([^'\n']+ as line) eof + (* Special case: some data but missing '\n', then eof. + Consider this as the last line, and add the missing '\n'. *) + { Some (line), false } diff --git a/src/lib/parser.mly b/src/lib/parser.mly new file mode 100644 index 0000000..acb8e25 --- /dev/null +++ b/src/lib/parser.mly @@ -0,0 +1,118 @@ +(* + +See [1] for the theory behind the analysis + + +[1] https://fr.m.wiktionary.org/wiki/Annexe:Prononciation/fran%C3%A7ais#Structure_syllabique + + + *) +%parameter<T:Sounds.T> + + + +%{ + + module P = Process.M(T) + +%} +%start<T.t List.t> main + +%% + +occlusiv: + | P { T.p } + | B { T.b } + + | T { T.t } + | D { T.none } + + | C { T.k } + | K { T.k } + | G { T.none } + +fricativ: + | S { T.s () } + | SS { T.s () } + + | F { T.f } + + | C H { T.ch () } + +obstruent: + | occlusiv { $1 } + | fricativ { $1 } + +liquid: + | L { T.l () } + | R { T.r () } + +nasal: + | N { T.n () } + +opening_consonant: + | occlusiv { $1, None } + | fricativ { $1, None } + | nasal { $1, None } + | liquid { $1, None } + | obstruent liquid { $1, Some $2 } + | occlusiv fricativ { $1, Some $2 } + +(* Each voyel as two associated sounds, depending there is a followng sound or + not *) +voyels: + | A { T.a `Opened , T.a `Opened } + | A I { T.e `Opened, T.e `Opened } + | I { T.i `Opened , T.i `Opened } + | E { T.schwa () , T.schwa () } + | E_ACUTE E? { T.e `Closed , T.e `Closed } + | E U { T.eu `Opened , T.eu `Opened } + | O { T.o `Opened , T.o `Opened } + +nasal_voyels: + | A N { T.a' () , T.a' () } + %prec Low + +ending_consonant: + | S { Some (T.s ()) } + | T { None } + | R { Some (T.r ()) } + | nasal { Some $1 } + +ending_word: + | X { Some (T.muted (T.s ())) } + | S { Some (T.muted (T.s ())) } + | R { Some (T.muted (T.r ())) } + | T { Some (T.muted T.t) } + +consonant_group: + | opening_consonant + { + { ending = None + ; opening = [ fst $1 ] + ; following = snd $1 } + } + | ending_consonant + opening_consonant + { + { ending = Some $1 + ; opening = [ fst $2 ] + ; following = snd $2 } + } + +syllable: + | c = consonant_group? + v = voyels + { (v, c) } + + +syllables: + | { [] } + | ss = syllables s = syllable { s::ss } + + +word: + | syllables ending_word? EOL { P.rebuild $2 $1 } + +main: + | word { $1 } diff --git a/src/lib/process.ml b/src/lib/process.ml new file mode 100644 index 0000000..9bfd45d --- /dev/null +++ b/src/lib/process.ml @@ -0,0 +1,115 @@ +open StdLabels +module M(T:Sounds.T) = struct + + type voyel = (T.t * T.t ) + + type consonants = + { ending : T.t option option + ; opening : T.t list + ; following : T.t option } + + type group = voyel * consonants option + + type modifier = (group * T.t option option) -> (group * T.t option option) + + (** Apply all the modifiers to the syllabic group in order to correct the + relation between the elements + + This is just a fold_left list, and the order matter : for example + nasalisation shall be applied after the S vocalisation + + *) + let apply_modifiers + : group * T.t option option -> modifier list -> group * T.t option option + = fun e m -> + List.fold_left m + ~init:e + ~f:(fun e f -> f e) + + (** The Nasal modifier transform a voyel followed by N and a consonant + into a nasal voyel. + + Does this min that nasal voyel are not a distinct element, but just a + merge from two distinct elements ? *) + let nasal + : modifier + = fun init -> + let (((v1, v2), c) , ending) = init in + let ending = Option.bind ending (fun x -> x) in + match ending with + | None -> init + | Some ending -> + match T.is_nasal ending with + | false -> init + | true -> + (* Remove the ending consonant, and transform the voyel into + the nasal form *) + ( ( (T.nasal v1, T.nasal v2) + , c ) + , None ) + + let vocalize_s + : modifier + = fun init -> + let (((v1, v2), c) , ending) = init in + + match c with + | None -> init + | Some op -> match op.opening, op.ending with + | hd::[], None when hd = T.s () -> + let c = Some { op with opening = [T.z ()] } in + (((v1, v2), c) , ending) + | _ -> init + + let rec _rebuild ~(m:modifier list) acc ending_consonant : group list -> T.t list + = function + | [] -> acc + | hd::tl -> + + let modifier_ = vocalize_s :: nasal :: m in + let (voyel, consonants), ending_consonant = + apply_modifiers + (hd, ending_consonant) + modifier_ in + + (* Add the last consonant and the voyel *) + let m, acc = match ending_consonant with + | None -> modifier_, (snd voyel)::acc + | Some s -> + let default = modifier_, (fst voyel) :: acc in + match s with + | None -> default + | Some s -> + + modifier_, (fst voyel) :: s::acc in + + match consonants with + | None -> _rebuild ~m acc None tl + | Some {ending; opening; following} -> + + let acc = match following with + | None -> acc + | Some s -> s::acc in + + match opening with + | [] ->_rebuild ~m acc ending tl + | opening -> _rebuild ~m (opening @ acc) ending tl + + (** Rebuild the list in the normal order + + The voyels have to be choosen, depending either they are followed by a + consonant or not + + Some consonants may be changed depending of the following voyel + + The list has to be reversed + + and so one + + *) + let rebuild + : T.t option option -> group list -> T.t list + = fun ending elems -> + _rebuild ~m:[] [] ending elems + +end diff --git a/src/lib/reader.ml b/src/lib/reader.ml new file mode 100644 index 0000000..4b123d5 --- /dev/null +++ b/src/lib/reader.ml @@ -0,0 +1,37 @@ +open StdLabels + +module P = Parser +module Parser = P.Make(Sounds) +module I = Parser.MenhirInterpreter + +let sound_to_string + : Sounds.t list -> string + = fun t -> + let buff = Buffer.create 16 in + List.iter t + ~f:(fun f -> Buffer.add_string buff f.Sounds.repr); + Buffer.contents buff + +let succeed (res : Sounds.t list) = + Ok (sound_to_string res) + +let fail lexbuf (_ : 'a I.checkpoint) = + Error ( + Printf.sprintf + "At offset %d: syntax error." + (Lexing.lexeme_start lexbuf) + ) + +(* The parser has suspended itself because of a syntax error. Stop. *) + +let loop lexbuf result = + let supplier = I.lexer_lexbuf_to_supplier Lexer.letter lexbuf in + I.loop_handle succeed (fail lexbuf) supplier result + +let process (line : string) = + let lexbuf = Lexing.from_string line in + try + loop lexbuf (Parser.Incremental.main lexbuf.lex_curr_p) + with + | Lexer.Error msg -> Error msg + diff --git a/src/lib/sounds.ml b/src/lib/sounds.ml new file mode 100644 index 0000000..ec1ddf2 --- /dev/null +++ b/src/lib/sounds.ml @@ -0,0 +1,142 @@ +module type T = sig + + type t + val muted : t -> t + + val a : [`Closed | `Opened] -> t + val e : [`Closed | `Opened] -> t + val eu : [`Closed | `Opened] -> t + + val o : [`Closed | `Opened] -> t + val schwa : unit -> t + val i : [`Closed | `Opened] -> t + + + val nasal: t -> t + + val none: t + + val p: t + val b: t + val t: t + val k: t + val f: t + val s: unit -> t + val ch: unit -> t + val z: unit -> t + + val n: unit -> t + + val r: unit -> t + val l: unit -> t + + val is_voyel : t -> bool + val is_nasal : t -> bool + +end + +module T = struct + type kind = + | None + | Voyel + + type t = + { repr : string + ; muted : bool + ; kind : kind + ; nasal : bool + } +end + +module Repr = struct + + let a = "a" + and a_nasal = "@" + + and o_nasal = "§" + +end + +module S = struct + + include T + + let is_voyel t = t.kind = Voyel + let is_nasal t = t.nasal + + let none = + { repr = "." + ; muted = false + ; kind = None + ; nasal = false } + + let nasal t = + match t.repr with + | "a" -> { t with repr = Repr.a_nasal ; nasal = true } + | "o" -> { t with repr = Repr.o_nasal ; nasal = true } + | _ -> t + + let muted f = + { none with + repr = "(" ^ f.repr ^ ")" + ; muted = true } + + let a _ = + { none with repr = Repr.a } + + let e = function + | `Closed -> { none with repr = "e" } + | `Opened -> { none with repr = "E" } + + let eu = function + | `Closed -> { none with repr = "2" } + | `Opened -> { none with repr = "9" } + + + let schwa () = + { none with repr = "°" } + + let o _ = + { none with repr = "o" } + + let i _ = + { none with repr = "i" } + + let p = + { none with repr = "p" } + + let b = + { none with repr = "b" } + + let t = + { none with repr = "t" } + + let k = + { none with repr = "k" } + + let f = + { none with repr = "f" } + + let s () = + { none with repr = "s" } + + let ch () = + { none with repr = "S" } + + let z () = + { none with repr = "z" } + + let n () = + { none with + repr = "n" + ; nasal = true } + + let l () = + { none with repr = "L" } + + let r () = + { none with repr = "R" } + +end + +include S diff --git a/src/lib/tokens.mly b/src/lib/tokens.mly new file mode 100644 index 0000000..428c744 --- /dev/null +++ b/src/lib/tokens.mly @@ -0,0 +1,46 @@ +%token Sep + +%token A +%token B +%token C +%token D +%token E +%token E_ACUTE +%token F +%token G +%token H +%token I +%token J +%token K +%token L +%token M +%token N +%token O +%token Q +%token P +%token R +%token S +%token SS +%token T +%token U +%token V +%token W +%token X +%token Y +%token Z +%token Space +%token EOL + +%nonassoc Low + +%right A E E_ACUTE I O U + +%right C H J Q V W X Y Z +%right P B T D K G +%right S SS F + +%right N M +%right L R +%right High + +%% diff --git a/src/test/dune b/src/test/dune new file mode 100644 index 0000000..f8dbc72 --- /dev/null +++ b/src/test/dune @@ -0,0 +1,9 @@ +(test + (name + test + ) + (libraries translator) + ) + + + diff --git a/src/test/test.ml b/src/test/test.ml new file mode 100644 index 0000000..ca74a9e --- /dev/null +++ b/src/test/test.ml @@ -0,0 +1,66 @@ +open StdLabels + +module T = Translator +let error = ref 0 + +let process (optional_line : string option) expected = + match optional_line with + | None -> () + | Some line -> + match T.Reader.process line with + | Error result -> + error := 1; + Printf.fprintf stdout + "%s : %s\n%!" + line + result + | Ok response -> + if String.equal response expected then + () + else ( + error := 1; + (Printf.fprintf stdout "%s : got %s / %s expected\n%!" + line + response + expected) + ) + + +let rec repeat input expected = + (* Attempt to read one line. *) + let optional_line, continue = T.Lexer.line input in + process optional_line expected; + if continue then + repeat input expected + +let tests = + [ "abaca", "abaka" + ; "abaissa", "abEsa" + ; "abaissait", "abEsE(t)" + ; "abaissant", "abEs@(t)" + ; "abaissées", "abEse(s)" + ; "abaissera", "abEs°Ra" + ; "achat", "aSa(t)" + ; "astiqué", "astike" + ; "casait", "kazE(t)" + ; "cassait", "kasE(t)" + ; "chanci", "S@si" + ; "chat", "Sa(t)" + ; "chipant", "Sip@(t)" + ; "pacha", "paSa" + ; "péché", "peSe" + ; "persai", "pERse" + ; "asia", "azia" + ; "ani", "ani" + ; "anta", "@ta" + ; "plat", "pLa(t)" + ] + +let () = + let () = List.iter tests + ~f:(fun (input, expected) -> + repeat (Lexing.from_string input) expected) + in + + exit !error + |