diff options
105 files changed, 8226 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..37d870f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +_build/ +examples/*.csv +examples/*.sqlite +*.opam diff --git a/.ocamlformat b/.ocamlformat new file mode 100644 index 0000000..72fc0fd --- /dev/null +++ b/.ocamlformat @@ -0,0 +1,11 @@ +profile = default
+parens-tuple = always
+sequence-style = terminator
+
+single-case = sparse
+type-decl = sparse
+break-separators = after
+break-cases = fit-or-vertical
+
+# Keep the begin/end blocks
+exp-grouping=preserve
diff --git a/bin/dune b/bin/dune new file mode 100755 index 0000000..fad168d --- /dev/null +++ b/bin/dune @@ -0,0 +1,38 @@ +(env
+ (dev
+ (flags (:standard -warn-error -A))
+ )
+ (release
+ (ocamlopt_flags (-O3)))
+)
+
+(executable
+ (name importer)
+ (libraries
+ csv
+ lwt
+ lwt.unix
+ otoml
+ yojson
+ tools
+ helpers
+ importConf
+ importAnalyser
+ importContainers
+ importDataTypes
+ importCSV
+ importErrors
+ importExpression
+ importFileHandler
+ importSQL
+ )
+ (link_flags (:standard))
+(preprocess (pps ppx_yojson_conv))
+)
+
+(install
+ (files importer.exe)
+ (section bin)
+ (package importer))
+
+(dirs :standard \ examples)
diff --git a/bin/importer.ml b/bin/importer.ml new file mode 100644 index 0000000..f737a46 --- /dev/null +++ b/bin/importer.ml @@ -0,0 +1,316 @@ +open StdLabels +module Analyse = ImportAnalyser.Dependency +module Headers = ImportAnalyser.Headers +module Db = ImportSQL.Db +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +let ( let* ) = Result.bind + +let exists source = + match Sys.file_exists source with + | true -> source + | false -> + prerr_endline + @@ String.concat ~sep:" " [ "The file"; source; "does not exists" ]; + exit 1 + +let creation_date file = + try + let stat = Unix.stat file in + Some stat.Unix.st_mtime + with + | Unix.Unix_error _ -> None + +module Args = struct + type arguments = { + configuration : ImportConf.Syntax.t; + conf_name : string; + bom : bool; + print_conf : bool; + mapping_date : float option; + } + + let load_conf : string -> ImportConf.Syntax.t = + fun file -> + match Filename.extension file with + | ".json" -> ( + let configuration_file = Yojson.Safe.from_file (exists file) in + try ImportConf.t_of_yojson configuration_file with + | e -> + print_endline @@ ImportErrors.repr_error e; + exit 1) + | _ -> ( + let (conf : (Db.Syntax.t, string) result) = + let* configuration_file = + try Ok (Otoml.Parser.from_file (exists file)) with + | Otoml.Parse_error (position, message) -> + let format_position () = function + | None -> "" + | Some (line, col) -> + Printf.sprintf "At line %d, col %d: " line col + in + + let error_msg = + Printf.sprintf "%aError %s" format_position position message + in + + Error error_msg + in + ImportConf.t_of_toml configuration_file + in + match conf with + | Error e -> + prerr_endline e; + exit 1 + | Ok e -> e) + + let load () = + let conf = ref ("", ImportConf.dummy_conf) + and bom = ref true + and usage = "importer [--conf configuration.toml]" + and print_conf = ref false in + + let annon_fun _filename = + print_endline usage; + exit 1 + and set_conf file = conf := (file, load_conf file) in + let speclist = + [ + ( "--version", + Arg.Unit + (fun () -> + Printf.printf "Version %s\n" Tools.Git_hash.revision; + exit 0), + "\tDisplay the version of the application and exit" ); + ("--conf", Arg.String set_conf, "Configuration file"); + ("-c", Arg.String set_conf, "Configuration file"); + ("--no-bom", Arg.Clear bom, "Do not insert a BOM in the CSV"); + ( "--print-conf", + Arg.Set print_conf, + "Reformat the configuration file and exit" ); + ] + in + let () = Arg.parse speclist annon_fun usage in + { + configuration = snd !conf; + bom = !bom; + conf_name = fst !conf; + print_conf = !print_conf; + mapping_date = None; + } +end + +(** Print the result from the query. + + Each value is given with the associated expression in the configuration, + the function is expected to convert the result into string in order to + include the content in the output CSV. + *) +let printer : Path.t ImportExpression.T.t * ImportCSV.DataType.t -> string = + fun (column, value) -> + ignore column; + ImportCSV.DataType.to_string value + +let bom = "\xEF\xBB\xBF" + +let process_table : + 'a Db.t -> + string -> + Csv.out_channel Lazy.t -> + Args.arguments -> + string array Headers.SheeetMap.t -> + Analyse.t -> + string array Headers.SheeetMap.t = + fun db dirname log_error conf map mapping -> + let source = Analyse.table mapping in + (* Load all the element in the database *) + let file = Filename.concat dirname source.file in + + let file_date = creation_date file in + let recent = + match (conf.mapping_date, file_date) with + | Some r, Some t -> r > t + | _, _ -> false + in + + match (recent, ImportSQL.Db.check_table_schema db mapping) with + | true, Ok true -> ( + (* If the data extraction is more recent than the sheet, and if the + configuration did not changed for this table, we do not reload the + table. *) + match ImportSQL.Db.query_headers db source with + | Ok v -> + let text_headers = Array.map v ~f:ImportCSV.DataType.to_string in + Headers.SheeetMap.add source text_headers map + | Error _ -> map) + | _ -> + Printf.printf "Loading document %s %!" source.name; + let headers_opt = + let extension = String.lowercase_ascii (Filename.extension file) in + match extension with + | ".xlsx" -> + Lwt_main.run + @@ ImportFileHandler.Xlsx2sql.importInDatable ~dirname + ~conf:conf.configuration ~log_error mapping db + | ".csv" -> + Lwt_main.run + @@ ImportFileHandler.Csv2sql.importInDatable ~dirname + ~conf:conf.configuration ~log_error mapping db + | _ -> raise (ImportErrors.Unknown_source extension) + in + Helpers.Console.close_cursor (); + + (* Get the headers *) + let headers = + match headers_opt with + | None -> map + | Some v -> + let text_headers = Array.map v ~f:ImportCSV.DataType.to_string in + Headers.SheeetMap.add source text_headers map + in + + (* For each external check if the values are loaded *) + let dependancies = + ImportConf.get_dependancies_for_table conf.configuration source + in + List.iter dependancies ~f:(fun ext -> + match ext.ImportConf.Syntax.allow_missing with + | true -> () + | false -> ( + Printf.printf "Checking dependancies for %s %!" + ext.ImportConf.Syntax.target.ImportDataTypes.Table.name; + try + ignore + @@ Db.check_foreign db conf.configuration ext ~f:(fun values -> + Helpers.Console.update_cursor (); + + let row = + match snd (Array.get values 0) with + | ImportCSV.DataType.Integer i -> i + | _ -> -1 + and value = snd (Array.get values 1) in + let error = + ImportErrors. + { + source; + sheet = source.Table.tab; + row; + value; + target = Some ext.ImportConf.Syntax.target; + exn = + Failure + (Printf.sprintf "Key '%s' not found" + (CSV.DataType.to_string value)); + } + in + + ImportErrors.output_error log_error error); + Helpers.Console.close_cursor () + with + | Sqlite3.Error _ -> + (* We can have errors here if we have cycles in the + dependencies, but it’s OK at this step.*) + ())); + headers + +let () = + let conf = Args.load () in + + (* Global configuration variables *) + let dirname = Filename.dirname conf.conf_name in + let basename = Filename.basename conf.conf_name in + let prefix = Filename.remove_extension basename in + + (* Analyse the configuration *) + let process_order = + try Analyse.get_process_order conf.configuration with + | e -> + prerr_endline @@ ImportErrors.repr_error e; + exit 1 + in + + (* With the printconf option, we do not need to open any file *) + if conf.print_conf then ( + let toml = ImportConf.Syntax.repr conf.configuration in + Otoml.Printer.to_channel ~collapse_tables:true stdout toml; + exit 0); + + let sqlfile = Filename.concat dirname (prefix ^ ".sqlite") in + let conf = { conf with mapping_date = creation_date sqlfile } in + + (* Ensure that all the files exists *) + List.iter process_order ~f:(fun (mapping : Analyse.t) -> + let source = Analyse.table mapping in + (* First, check *) + if not (Sys.file_exists source.Table.file) then begin + ignore @@ exists @@ Filename.concat dirname source.Table.file + end); + + (* The configuration is loaded and valid, we create the errors log file *) + let log_error = ImportErrors.log ~with_bom:conf.bom prefix dirname in + + (* Now check if we can open the output file. Opening the file "truncate" it. + I do not want to delete the file content before I am sure that I can + process it. This is why the file is opened after reading the configuration + file although we do not need the configuration file for it. + *) + let out_channel = + try + Out_channel.open_bin (Filename.concat dirname (String.cat prefix ".csv")) + with + | Sys_error e -> + prerr_endline e; + exit 1 + in + + Fun.protect + ~finally:(fun () -> + Out_channel.flush out_channel; + Out_channel.close_noerr out_channel) + (fun () -> + (* Add the BOM to the CSV *) + if conf.bom then output_string out_channel bom; + + let out_csv = + Csv.to_channel ~separator:';' ~excel_tricks:false out_channel + in + + (* Create the database *) + ignore + @@ Db.with_db sqlfile (fun db -> + let headers = + List.fold_left process_order ~init:Headers.SheeetMap.empty + ~f:(process_table db dirname log_error conf) + in + + let first_line = Headers.columns conf.configuration headers in + Csv.output_record out_csv first_line; + + (* Run the query *) + ignore @@ Db.create_view db conf.configuration; + Printf.printf "Extracting results %!"; + match + Db.query + ~f:(fun v -> + let arr = Array.to_seq v |> Seq.map printer |> List.of_seq in + + Helpers.Console.update_cursor (); + Csv.output_record out_csv arr) + db conf.configuration + with + | Ok () -> + Printf.printf "%c[?25h%c[1D%c[0K\n%!" (char_of_int 27) + (char_of_int 27) (char_of_int 27); + Ok () + | Error e -> + Printf.printf "%c[?25h%c[1D%c[0K\n%!" (char_of_int 27) + (char_of_int 27) (char_of_int 27); + print_endline @@ ImportErrors.repr_error e; + + Ok ())); + + (* Close the error file *) + match Lazy.is_val log_error with + | false -> () + | true -> Csv.close_out (Lazy.force log_error) diff --git a/dune-project b/dune-project new file mode 100755 index 0000000..ca435e6 --- /dev/null +++ b/dune-project @@ -0,0 +1,27 @@ +(lang dune 3.7) +(using menhir 2.0) + +(generate_opam_files true) +(implicit_transitive_deps true) +(authors "Sébastien Dailly") +(maintainers "Sébastien Dailly") + +(package + (name importer) + (depends + (ocaml (>= 4.14.0)) + (sqlite3 (>= 5.1.0)) + (SZXX (and (>= 2.1.0) (<= 4.0.0))) + (csv (>= 2.4)) + (csv-lwt (>= 2.4)) + (decoders (>= 1.0.0)) + (ppx_yojson_conv (>= v0.14.0)) + (ounit (>= 2.2.6)) + (otoml (>= 1.0.1)) + (re (>= 1.10.4)) + (ppx_deriving (>= 5.2.1)) + (tsort (>= 2.1.0)) + (calendar (>= 3.0.0)) + ) + (allow_empty) +) diff --git a/examples/example_csv.toml b/examples/example_csv.toml new file mode 100644 index 0000000..1374515 --- /dev/null +++ b/examples/example_csv.toml @@ -0,0 +1,34 @@ +version = 1 + +[source] + file = "importer.csv" + name = "source" + +[externals.target] + intern_key = ":source.A" + extern_key = ":A" + file = "financial.xlsx" + allow_missing = false + +[externals.a_financial] + intern_key = ":target.A" + extern_key = ":O" + file = "financial.xlsx" + allow_missing = false + +[sheet] + columns = [ + ":A", + "concat(\"-\", :A, :target.E, :B)", + ":C", + """counter( + [:C], + [:A])""", + ":E", + "match(\"\\(..\\)\", :B)", + ":D", + "counter([:D] ,[:A])" + ] + filters = [] + sort = [] + uniq = [] diff --git a/examples/financial.xlsx b/examples/financial.xlsx Binary files differnew file mode 100644 index 0000000..6ed8522 --- /dev/null +++ b/examples/financial.xlsx diff --git a/examples/importer.toml b/examples/importer.toml new file mode 100644 index 0000000..4be6618 --- /dev/null +++ b/examples/importer.toml @@ -0,0 +1,44 @@ +version = 1 + +[source] + file = "financial.xlsx" + name = "source" + +[externals.target] + intern_key = ":source.A ^ '-suffix'" + extern_key = ":A ^ '-suffix'" + file = "financial.xlsx" + allow_missing = false + +[externals.a_financial] + intern_key = ":target.A" + extern_key = ":O" # This key is here to generate errors + file = "financial.xlsx" + allow_missing = false + +[sheet] + columns = [ + ":target.A ^ '\\''", # Ensure the quote is escaped before sending to the sql engine + "concat('-', :A, :target.E, :B)", + ":C", + "counter([:C], [:A])", + "sum(:F, [:B, :C, :D], [:B])", + "int(1) = counter([:C], [:A])", + ":E", "match('(..)', :B)", + ":D", + "counter([:D], [:A])", + "year(:N)", + # Apply a function on the year + """:Q + - + # We only keep the years after Y2K + 2000""" + + ] + filters = [ + + "1 = counter( [:B], [:A])", + + ] + sort = [] + uniq = [] diff --git a/examples/importer_groupe.toml b/examples/importer_groupe.toml new file mode 100644 index 0000000..eb2f7e6 --- /dev/null +++ b/examples/importer_groupe.toml @@ -0,0 +1,28 @@ +# Cet exemple permet de montrer la combinaison de fonction de groupe avec des +# filtres. + +# On recherche ici le plus grand nombre d’unitées vendues en France. +# Pour que le résultat soit conforme à l’attendu, il faut que l’application +# commence par filtrer les lignes qui concernent la France, avant de chercher +# la ligne contenant le plus grand nombre d’unités. + +version = 1 + +[source] + file = "financial.xlsx" + name = "source" + +[sheet] + columns = [ + ":A", + ":C", + ":D", + ":N", + + ] + filters = [ + + ":C = 'France'", + ":A = max(:A, [:D], [:F])", + + ] diff --git a/lib/analysers/dependency.ml b/lib/analysers/dependency.ml new file mode 100644 index 0000000..e81cc49 --- /dev/null +++ b/lib/analysers/dependency.ml @@ -0,0 +1,256 @@ +open StdLabels +module IntSet = ImportContainers.IntSet +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module Expression = ImportExpression.T + +(* + Internally, the dependency chain is represented as a graph. + + Each file to process (csv or xlsx) is marked as a node, The link between + each node cannot be represented in the graph because each index can imply + multiple tables together (for exemple [join(source.A, other.B)] ). So the + graph is only used in order to get the import process order, and each + information is keeped in a separate table. + +*) + +type deps = (ImportContainers.Source.t * ImportContainers.Source.t list) list + +type key = { + name : string; + expression : Path.column Expression.t; + columns : ImportContainers.IntSet.t Lazy.t; +} + +type t = { + table : Table.t; + columns : IntSet.t; + keys : key list; +} + +let table t = t.table +let columns t = t.columns +let keys t = t.keys + +type build_map = t ImportContainers.Externals.t + +(* The expression can be qualified with full path (when we are in the column + definition) or only with a column (in a binding with an externanl table). + + This type is here to extract the correct information, according to the type + we are dealing with : + + - [to_mapping] allow to pick the right Set in which we need to add the + column pointed by the path (this can be [keys] or [columns] + - [of_path] is need to extract the qualified source from any kind of path. +*) +type 'a expression_extractor = { + to_mapping : t -> Path.column -> t; + of_path : 'a -> string option * Path.column; +} + +(** [add_path_in_map f parent path ] Extract the column from element [path] and + process the column in the function [f] + + The [path] is abstract, but the function [f.of_path] can extract the needed + elements in order to add it in the mapping. + + The function may raise [Unknown_source] if the the path describe an unknown + table. *) +let add_path_in_map : + f:'a expression_extractor -> conf:Syntax.t -> 'a -> build_map -> build_map = + fun ~f ~conf path map -> + let table_source, column = f.of_path path in + let table = + try ImportConf.get_table_for_name conf table_source with + | Not_found -> raise (ImportErrors.Unknown_source (Option.get table_source)) + in + + ImportContainers.Externals.update map + ~key:(ImportContainers.KeyName.from_table table) ~f:(fun v -> + let mapping = + match v with + | None -> raise (ImportErrors.Unknown_source table.name) + | Some mapping -> mapping + in + + Some (f.to_mapping mapping column)) + +let add_expression_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t -> + build_map -> + build_map = + fun ~f ~conf expr map -> + Expression.fold_values expr ~init:map ~f:(fun map p -> + add_path_in_map ~f ~conf p map) + +let add_columns_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t list -> + build_map -> + build_map = + fun ~f ~conf columns map -> + let columns = + List.fold_left columns ~init:map ~f:(fun map expression -> + let new_map = add_expression_in_map ~f ~conf expression map in + new_map) + in + columns + +(* [add_dependancies ~conf source map path] + add the dependancy from the table [source] to another one after analysing the + expression and extracting the path contained inside. + + This function is called for each path declared inside the expression. *) +let add_dependancies : conf:Syntax.t -> Syntax.extern -> deps -> Path.t -> deps + = + fun ~conf extern graph path -> + let source_table = ImportConf.get_table_for_name conf path.Path.alias in + + let source = ImportContainers.Source.from_table source_table in + let target = ImportContainers.Source.from_table extern.target in + + match ImportContainers.Source.equal target source with + | true -> graph + | _ -> (target, [ source ]) :: graph + +let add_external_in_map : + conf:Syntax.t -> Syntax.extern -> build_map * deps -> build_map * deps = + fun ~conf extern (map, graph) -> + let dest = ImportContainers.KeyName.from_table extern.target in + (* Pre-check that every source is already declared in the configuration. *) + let _ = + Expression.fold_values extern.intern_key ~init:() ~f:(fun () path -> + try + let _ = ImportConf.get_table_for_name conf path.Path.alias in + () + with + | Not_found -> ( + match path.alias with + | Some table -> raise (ImportErrors.Unknown_source table) + | None -> + (* This is very unlikely. This error would be raised if we have + no source for this import *) + let root = conf.source in + raise (ImportErrors.Unknown_source root.name))) + in + + (* Create the new key with all the expression and all the columns inside it *) + let new_key = + { + name = extern.target.Table.name; + expression = extern.extern_key; + columns = + lazy + (Expression.fold_values extern.extern_key + ~f:(fun acc k -> ImportContainers.IntSet.add k acc) + ~init:ImportContainers.IntSet.empty); + } + in + let build_map = + ImportContainers.Externals.update map ~key:dest ~f:(function + | None -> + (* Create the entry for the external if it does not exists *) + Some + { + table = extern.target; + columns = IntSet.empty; + keys = [ new_key ]; + } + | Some mapping -> + (* Or update the existing one with the key we just created *) + Some { mapping with keys = new_key :: mapping.keys }) + in + let graph = + Expression.fold_values extern.intern_key ~init:graph + ~f:(add_dependancies ~conf extern) + in + let build_map = + add_expression_in_map extern.intern_key build_map ~conf + ~f: + { + of_path = + (fun Path.{ alias; column } -> + let table = ImportConf.get_table_for_name conf alias in + (Some table.name, column)); + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + } + in + (build_map, graph) + +let mapper = + { + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + of_path = (fun ({ alias; column } : Path.t) -> (alias, column)); + } + +let get_mapping : Syntax.t -> build_map * deps = + fun conf -> + let root = ImportContainers.Source.from_table (ImportConf.root_table conf) + and root' = + ImportContainers.KeyName.from_table (ImportConf.root_table conf) + in + let graph = [ (root, []) ] in + + (* For each external declared in the configuration file, add the columns to + query *) + let init = + ( ImportContainers.Externals.singleton root' + { + table = ImportConf.root_table conf; + columns = IntSet.empty; + keys = []; + }, + graph ) + in + let map, graph = + List.fold_left conf.externals ~init ~f:(fun map extern -> + add_external_in_map ~conf extern map) + in + + (* Now we don’t bother anymore with the graph and it’s dependency, we just + collect all the columns in the differents expressions *) + let map = + map + |> add_columns_in_map ~conf ~f:mapper conf.columns + |> add_columns_in_map ~conf ~f:mapper conf.sort + |> add_columns_in_map ~conf ~f:mapper conf.filters + |> add_columns_in_map ~conf ~f:mapper conf.uniq + in + (map, graph) + +let get_process_order : Syntax.t -> t list = + fun map -> + let map, graph = get_mapping map in + + match Tsort.sort graph with + | Tsort.ErrorCycle l -> + let name_of_key k = + ImportContainers.(Externals.find (Source.name k) map).table.name + in + raise (ImportErrors.Cycle (List.map ~f:name_of_key l)) + | Sorted elements -> + (* It’s OK, we know there is no cycles now, we can extract the files to + load from this list. + *) + List.filter_map elements ~f:(fun v -> + ImportContainers.Externals.find_opt + (ImportContainers.Source.name v) + map) + (* This list can still have duplicated values, and we have to remove them + still keeping the order. + *) + |> List.fold_left ~init:[] ~f:(fun acc element -> + (* Prevent the same file to beeing loaded twice *) + match List.mem element ~set:acc with + | true -> acc + | false -> element :: acc) diff --git a/lib/analysers/dependency.mli b/lib/analysers/dependency.mli new file mode 100644 index 0000000..c89522a --- /dev/null +++ b/lib/analysers/dependency.mli @@ -0,0 +1,40 @@ +type t + +val get_process_order : ImportConf.Syntax.t -> t list +(** Extract the file list to process, following the identified dependancies. + Try to load first the document which does not required another spreadsheet, + and keep going in the topological order + + Raise [Unknown_source file] if a source is referenced but is not declared. + The order matter : the exception will be raised in a source is referenced + before the declaration (when chaining differents externals) *) + +val table : t -> ImportDataTypes.Table.t +(** Get the table to proceed. *) + +val columns : t -> ImportContainers.IntSet.t +(** A set of columns loaded in this table. Thoses columns may not need + reported in the final export. + + Each column is identified by is index in the Excel document. + + This set does not include the columns used in the keys. They can be + fetched with the [keys] function *) + +type key = { + name : string; (** This is the name of the target table we are pointed to *) + expression : ImportDataTypes.Path.column ImportExpression.T.t; + (** The expression used as key *) + columns : ImportContainers.IntSet.t Lazy.t; + (** The list of columns used in the key. All the columns are referenced + in the expression. We can have many columns used inside a single + key when a function is used (for example for joining multiple + columns into a single key) *) +} +(** This type describe the join key in a table. The name is the refering table + using this key (the key name in the datable is key_"name" ), and the + expression describe how to build the key. *) + +val keys : t -> key list +(** [keys] is the list of columns pointed by another one. They are + considered as join key between the diffrent tables. *) diff --git a/lib/analysers/dune b/lib/analysers/dune new file mode 100755 index 0000000..1bbc30f --- /dev/null +++ b/lib/analysers/dune @@ -0,0 +1,12 @@ +(library
+ (name importAnalyser)
+ (libraries
+ importConf
+ importContainers
+ importCSV
+ importDataTypes
+ importExpression
+ importErrors
+ tsort
+ )
+)
diff --git a/lib/analysers/headers.ml b/lib/analysers/headers.ml new file mode 100644 index 0000000..916dfee --- /dev/null +++ b/lib/analysers/headers.ml @@ -0,0 +1,55 @@ +open StdLabels +module I = ImportConf +module E = ImportExpression.T +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +module SheeetMap = Map.Make (struct + type t = Table.t + + (** We are sure we can’t have the same name for two different table. *) + let compare v1 v2 = String.compare (Table.name v1) (Table.name v2) +end) + +type content = string array + +type t = content SheeetMap.t +(** The map associate a line of headers for each table. + + The header are always in string. *) + +(** Get the headers. The function has to be called after reading each document, + and will reformat the first line with the values from the cell. The + functions will not be evaluated (instead they will be displayed "as is". + + When there is no value for this path, return empty string. + *) +let columns : Syntax.t -> t -> string list = + fun conf t -> + (* We build here a custom printer which search in the array for the column + name. + + This function will be given as argument in the expression printer. *) + let f : Path.t -> Buffer.t -> unit = + fun path b -> + let source = I.get_table_for_name conf path.alias in + + match SheeetMap.find_opt source t with + | None -> () + | Some arr -> ( + try Buffer.add_string b (Array.get arr (path.column - 1)) with + | _ -> + prerr_endline + @@ Printf.sprintf "No header found for :%s.%s" + (Option.value ~default:(I.root_table conf).Table.name + path.alias) + (ImportCSV.Csv.column_to_string path.column)) + in + + List.map conf.Syntax.columns ~f:(fun column -> + let b = Buffer.create 4 in + + ImportExpression.Headers.headers_of_expression b f column; + + Buffer.contents b) diff --git a/lib/analysers/headers.mli b/lib/analysers/headers.mli new file mode 100644 index 0000000..03e384b --- /dev/null +++ b/lib/analysers/headers.mli @@ -0,0 +1,11 @@ +module SheeetMap : Map.S with type key = ImportDataTypes.Table.t + +type t = string array SheeetMap.t + +val columns : ImportConf.Syntax.t -> t -> string list +(** Get the headers. The function has to be called after reading each document, + and will reformat the first line with the values from the cell. The + functions will not be evaluated (instead they will be displayed "as is". + + When there is no value for this path, return empty string. + *) diff --git a/lib/analysers/query.ml b/lib/analysers/query.ml new file mode 100644 index 0000000..7a6dd2a --- /dev/null +++ b/lib/analysers/query.ml @@ -0,0 +1,445 @@ +open StdLabels +module Expression = ImportExpression +module Q = Expression.Query +module Syntax = ImportConf.Syntax +module Table = ImportConf.Table +module Path = ImportConf.Path + +let truncate buffer n = Buffer.truncate buffer (Buffer.length buffer - n) + +(** The module allow to create fragment in the query which keep together the + binderd parameters and the text of the query.contents. + + This is used a lot in order to create the CTE, where you need the create + fragment used both in the main request and partially in the CTE itself. + + The content is mutable and all the functions are returning [unit]. *) +module Chunk = struct + type t = { + b : Buffer.t; + parameters : ImportCSV.DataType.t Queue.t; + } + + let create : unit -> t = + fun () -> { b = Buffer.create 16; parameters = Queue.create () } + + let create' : Buffer.t -> ImportCSV.DataType.t Queue.t -> t = + fun b parameters -> { b; parameters } + + (* Append the element from [tail] at the end of [head] + + Tail is destroyed during the operation. + *) + let append : head:t -> tail:t -> unit = + fun ~head ~tail -> + match Buffer.length tail.b with + | 0 -> () + | _ -> + Buffer.add_buffer head.b tail.b; + Queue.transfer tail.parameters head.parameters; + () + + (** Add a litteral string in the sequence *) + let add_string : t -> string -> unit = fun t v -> Buffer.add_string t.b v + + let copy : t -> t = + fun t -> + let b = Buffer.create 16 and parameters = Queue.copy t.parameters in + Buffer.add_buffer b t.b; + { b; parameters } + + let add_parameters : t -> ImportCSV.DataType.t Seq.t -> unit = + fun t p -> Queue.add_seq t.parameters p +end + +let prepare_key : f:(Format.formatter -> unit) -> Format.formatter -> unit = + fun ~f formatter -> Format.fprintf formatter "rtrim(upper(%t))" f + +(* Collect all the tables pointed by the expression. *) +let pointed_tables : Syntax.t -> 'a Expression.T.t -> (Table.t * string) list = + fun conf expression -> + Expression.T.fold_values expression ~init:[] ~f:(fun acc path -> + let table = ImportConf.get_table_for_name conf path.Path.alias in + let table_name = Table.name table in + (table, table_name) :: acc) + |> List.sort_uniq ~cmp:Stdlib.compare + +(** Represent a column in a safe way in a query *) +let print_column : Table.t -> string -> string = + fun table column -> + String.concat ~sep:"" [ "'"; table.Table.name; "'.'"; column; "'" ] + +let create_table : Dependency.t -> string = + fun mapping -> + let b = Buffer.create 64 in + + Buffer.add_string b "CREATE TABLE '"; + Buffer.add_string b (Table.name (Dependency.table mapping)); + Buffer.add_string b "' (id INTEGER PRIMARY KEY"; + + List.iter (Dependency.keys mapping) ~f:(fun { Dependency.name; _ } -> + Buffer.add_string b ",'key_"; + Buffer.add_string b name; + Buffer.add_string b "'"); + + ImportContainers.IntSet.iter (Dependency.columns mapping) ~f:(fun i -> + Buffer.add_string b ",'col_"; + Buffer.add_string b (string_of_int i); + Buffer.add_string b "'"); + Buffer.add_string b ")"; + + Buffer.contents b + +let show_path : conf:Syntax.t -> Format.formatter -> Path.t -> unit = + fun ~conf buffer { alias; column } -> + let table = ImportConf.get_table_for_name conf alias in + let table_name = table.Table.name in + Format.fprintf buffer "'%s'.col_%d" table_name column + +(** Extract the informations from the dependancies. We get two informations here : + + - the join query in order to load the data from the external column + - the column corresponding to the key in order to identify the missing + links later. + *) +let query_of_external : + conf:Syntax.t -> join_buffer:Chunk.t -> Syntax.extern -> unit = + fun ~conf ~join_buffer external_ -> + let extern_table = Table.name external_.target in + + let formatter = Format.formatter_of_buffer join_buffer.b in + Format.fprintf formatter "\nLEFT JOIN '%s' AS '%s' ON %t = %s" extern_table + external_.target.name + (prepare_key ~f:(fun f -> + let q = + Q.query_of_expression Q.BindParam f (show_path ~conf) + external_.intern_key + in + + Chunk.add_parameters join_buffer (Queue.to_seq q))) + (print_column external_.Syntax.target + ("key_" ^ external_.Syntax.target.name)); + + Format.pp_print_flush formatter () + +(** Create the from part of the query, adding all the reuired externals. + + SQLite is able to optimize the query and do not load the table not used in + the select clause. *) +let create_from_chunck : Syntax.t -> Chunk.t -> unit = + fun conf c -> + Chunk.add_string c "\nFROM '"; + Chunk.add_string c (Table.name conf.source); + Chunk.add_string c "' AS '"; + Chunk.add_string c conf.source.name; + Chunk.add_string c "'"; + + (* Add the externals in the query *) + List.iter conf.externals ~f:(query_of_external ~conf ~join_buffer:c) + +(** Build a CTE query in order to use any group function inside the query. + Return the binded parameters used in the expression. The buffer given in + argument is also modified during the construction. + + If filters is not None, the clauses are added to the CTE. *) +let build_cte : + Syntax.t -> + expression:'a Expression.T.t -> + filters:Chunk.t option -> + Chunk.t = + fun conf ~expression ~filters -> + (* The binded parameters queue will be used later in the full query *) + let cte_chunk = Chunk.create () in + + Chunk.add_string cte_chunk "WITH cte AS (SELECT "; + Chunk.add_string cte_chunk conf.source.name; + Chunk.add_string cte_chunk ".id, "; + + let formatter = Format.formatter_of_buffer cte_chunk.b in + + let p = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) expression + in + Format.pp_print_flush formatter (); + Chunk.add_parameters cte_chunk (Queue.to_seq p); + (* The name is hardcoded here, and used in [Expression.Filters.window] *) + Chunk.add_string cte_chunk " AS group0"; + + let () = create_from_chunck conf cte_chunk in + let () = + match filters with + | None -> () + | Some filters_chunk -> + Chunk.append ~head:cte_chunk ~tail:(Chunk.copy filters_chunk) + in + + Chunk.add_string cte_chunk ")\n"; + cte_chunk + +type filter_evaluation = { + content : Buffer.t; + parameters : ImportCSV.DataType.t Seq.t; + cte : (string * Chunk.t) option; +} +(** Build the filters to apply in the query. We make the difference here + between the predicates to apply directly in the query, and the filters + associated with a group, which are required to be transformed into a CTE +in SQL, and are evaluated before. *) + +(** Evaluate the filters on the query *) +let eval_filters : Syntax.t -> filter_evaluation = + fun conf -> + match conf.filters with + | [] -> + let empty_buffer = Buffer.create 0 in + { content = empty_buffer; parameters = Seq.empty; cte = None } + | filters -> ( + (* Create a new queue in order to accumulate all the parameters to bind. + This filter will be given to both the CTE if any, or reused in the + main query when there is no CTE. + *) + let chunk_filters = Chunk.create () in + Chunk.add_string chunk_filters "\nWHERE "; + + let group = Chunk.create () in + + let with_cte, with_exr = + List.fold_left filters ~init:(None, false) + ~f:(fun (with_cte, with_exr) column -> + (* The function will return an option in second position which is + None when no Group function where found, and Some Expression + otherwise *) + let b = Buffer.create 16 in + + let formatter = Format.formatter_of_buffer b in + let queue, group_found = + Expression.Filters.query_of_expression Q.BindParam formatter + (show_path ~conf) column + in + Format.pp_print_flush formatter (); + let clause = Chunk.create' b queue in + + match (group_found, with_cte) with + | None, _ -> + Chunk.append ~head:chunk_filters ~tail:clause; + Chunk.add_string chunk_filters "\nAND "; + (with_cte, true) + | (Some _ as group'), None -> + (* We have a group here, we do not add it into the + filter_buffer right now. + + This can occur only once, the second one will raise + an error. *) + Chunk.append ~head:group ~tail:clause; + (group', with_exr) + | Some _, Some _ -> raise ImportErrors.MisplacedWindow) + in + + match with_cte with + | None -> + let content = chunk_filters.b in + truncate content 5; + { + (* There is no group clause in the query *) + content; + parameters = Queue.to_seq chunk_filters.parameters; + cte = None; + } + | Some expression -> + let filters = + if with_exr then ( + (* If we have additionnals filters from the group clause, we + have to report them in the CTE instead of the main query. *) + let c' = Chunk.copy chunk_filters in + truncate c'.b 5; + Some c') + else None + in + + (* Create the common expression table *) + let cte_parameters = build_cte conf ~expression ~filters in + Chunk.append ~head:chunk_filters ~tail:group; + + { + content = chunk_filters.b; + parameters = Queue.to_seq chunk_filters.parameters; + (* The name is hardcoded here, and used in + [Expression.Filters.window] *) + cte = Some ("cte", cte_parameters); + }) + +type query = { + q : string; + parameters : ImportCSV.DataType.t Seq.t; +} + +(** Build the query and return also the mapping in order to identify each + external links between files. + + The select query will name each column with an alias, and the map allow to + find which source is pointed by this alias. *) +let select : Syntax.t -> query * Path.t ImportExpression.T.t array = + fun conf -> + (* If the filters contains a group expression, we need to transform this into + a CTE, which have to be evaluated before the main query. That’s why we are + evaluating the filters right now.*) + let filters = eval_filters conf in + let b = Buffer.create 256 in + let parameters = Queue.create () in + + Option.iter + (fun (_, (cte : Chunk.t)) -> + Buffer.add_buffer b cte.b; + Queue.add_seq parameters (Queue.to_seq cte.parameters)) + filters.cte; + + (* For each column in the configuration file, add the corresponding element + in the query. + + The Sqlite driver return the elements in an array, we create an array to + in order to manage the elements together. + *) + let headers = Array.make (List.length conf.columns) (Obj.magic None) in + + let columns = List.to_seq conf.columns |> Seq.mapi (fun i c -> (i, c)) in + let formatter = Format.formatter_of_buffer b in + let () = + Format.fprintf formatter "SELECT %a" + (Format.pp_print_seq + ~pp_sep:(fun f () -> Format.fprintf f ",\n") + (fun formatter (i, column) -> + Array.set headers i column; + let p = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer p parameters; + Format.fprintf formatter " AS result_%d" i)) + columns + in + Format.pp_print_flush formatter (); + + let () = create_from_chunck conf (Chunk.create' b parameters) in + + (* If the query has a CTE, link it as well. We use an INNER JOIN here because + we want to be sure to get all the rows fetched by the CTE + *) + let () = + match filters.cte with + | None -> () + | Some (name, _) -> + Buffer.add_string b "\nINNER JOIN '"; + Buffer.add_string b name; + Buffer.add_string b "' ON "; + Buffer.add_string b name; + Buffer.add_string b ".id = "; + Buffer.add_string b conf.source.name; + Buffer.add_string b ".id" + in + + Buffer.add_buffer b filters.content; + Queue.add_seq parameters filters.parameters; + + let formatter = Format.formatter_of_buffer b in + (match conf.Syntax.uniq with + | [] -> () + | uniq -> + Format.fprintf formatter "\nGROUP BY %a" + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f ", ") + (fun formatter column -> + let seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer seq parameters)) + uniq); + (match conf.Syntax.sort with + | [] -> () + | sort -> + Format.fprintf formatter "\nORDER BY %a" + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f ", ") + (fun formatter column -> + let seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer seq parameters)) + sort); + Format.pp_print_flush formatter (); + + ({ q = Buffer.contents b; parameters = Queue.to_seq parameters }, headers) + +let check_external : Syntax.t -> Syntax.extern -> query = + fun conf external_ -> + let extern_table = Table.name external_.target in + + let parameters = Queue.create () in + let internal_key_buffer = Buffer.create 16 in + let formatter = Format.formatter_of_buffer internal_key_buffer in + let internal_key_seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + external_.intern_key + in + Format.pp_print_flush formatter (); + + let external_key_buffer = Buffer.create 16 in + let pointed_tables = pointed_tables conf external_.intern_key in + Buffer.add_string external_key_buffer + (print_column external_.Syntax.target + ("key_" ^ external_.Syntax.target.name)); + + (* We do a copy before the transfert because the Queue is reused later in the + query *) + Queue.transfer (Queue.copy internal_key_seq) parameters; + + let join_content = Buffer.contents external_key_buffer in + let inner_content = Buffer.contents internal_key_buffer in + let b = Buffer.create 256 in + let formatter = Format.formatter_of_buffer b in + let () = + Format.fprintf formatter + "SELECT %a%s FROM%a LEFT JOIN '%s' AS '%s' ON %t = %s WHERE %s IS NULL \ + AND %s IS NOT NULL AND %s <> ''" + (fun formatter -> function + | [ (table, _name) ] -> + Format.fprintf formatter "%s, " (print_column table "id") + | _ -> Format.fprintf formatter "-1, ") + pointed_tables (* *) + inner_content (* *) + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.pp_print_text f ", ") + (fun formatter (table, name) -> + Format.fprintf formatter "\n'%s' AS '%s'" name table.Table.name)) + pointed_tables (* *) + extern_table (* *) + external_.target.name + (prepare_key ~f:(fun b -> + Format.pp_print_text b (Buffer.contents internal_key_buffer))) + join_content (* *) + join_content (* *) + inner_content (* *) + inner_content + in + + Format.pp_print_flush formatter (); + + { q = Buffer.contents b; parameters = Queue.to_seq parameters } + +let build_key_insert : Buffer.t -> Dependency.key -> unit = + fun buffer { Dependency.expression; _ } -> + let show_column : Format.formatter -> Path.column -> unit = + fun formatter column -> Format.fprintf formatter ":col_%d" column + in + + let formatter = Format.formatter_of_buffer buffer in + + let () = + prepare_key formatter ~f:(fun formatter -> + Q.query_of_expression Q.NoParam formatter show_column expression) + in + + Format.pp_print_flush formatter (); + + () diff --git a/lib/analysers/query.mli b/lib/analysers/query.mli new file mode 100644 index 0000000..14d2807 --- /dev/null +++ b/lib/analysers/query.mli @@ -0,0 +1,27 @@ +module Syntax = ImportConf.Syntax + +val create_table : Dependency.t -> string + +type query = { + q : string; (** The query to execute *) + parameters : ImportCSV.DataType.t Seq.t; +} +(** This type represent a query to execute. + [q] is the template to run, and shall be run with all the binded parameters. + *) + +val select : + Syntax.t -> query * ImportDataTypes.Path.t ImportExpression.T.t array + +val check_external : Syntax.t -> Syntax.extern -> query +(** Create a query which select all the missing key in an external *) + +val build_key_insert : Buffer.t -> Dependency.key -> unit +(* Build the fragment insert query. + + This use prepare statement and apply the function at the insert time, in + order to have the key already in the expected format in the database. + + The column are name :col_XX where XX is the index of the column in the + datasheet (starting from 1) +*) diff --git a/lib/configuration/dune b/lib/configuration/dune new file mode 100755 index 0000000..27d31a6 --- /dev/null +++ b/lib/configuration/dune @@ -0,0 +1,29 @@ +(library
+ (name importConf)
+ (libraries
+ decoders
+ otoml
+ menhirLib
+ importCSV
+ yojson
+ re
+ helpers
+ importDataTypes
+ importExpression
+ importErrors
+ )
+
+(preprocess (pps ppx_yojson_conv ppx_deriving.ord))
+)
+
+(rule
+ (targets expression_parser_messages.ml)
+ (deps expression_parser.messages expression_parser.mly)
+ (action (with-stdout-to %{targets} (run menhir --compile-errors %{deps}))))
+
+(menhir
+ (modules expression_parser)
+ (flags --table)
+)
+
+(ocamllex expression_lexer)
diff --git a/lib/configuration/expression_lexer.mll b/lib/configuration/expression_lexer.mll new file mode 100644 index 0000000..cbfc8dc --- /dev/null +++ b/lib/configuration/expression_lexer.mll @@ -0,0 +1,91 @@ +{ + open Expression_parser + module Expression = ImportExpression.T + + exception UnclosedQuote of { content: string ; line : int} +} + + +let spaces = [ ' ' '\t' ] +let letters = [^ '"' '\'' '(' ')' '[' ']' ':' '.' ',' '^' ' ' '\t' '\n' '\r' ] +let digit = [ '0'-'9' ] +let eol = [ '\r' '\n' ] + +let escaped = [ '\'' '\\'] + +rule token = parse +| eol { Lexing.new_line lexbuf; token lexbuf } +| spaces { token lexbuf } +| '\'' { + try read_quoted_string (Buffer.create 17) lexbuf + with Failure _ -> + let line = lexbuf.Lexing.lex_curr_p.pos_lnum + and content = Bytes.to_string lexbuf.Lexing.lex_buffer in + raise (UnclosedQuote {line; content}) +} +| '"' { read_dquoted_string (Buffer.create 17) lexbuf } +| '#' { skip_comment lexbuf } +| '(' { L_PAREN } +| ')' { R_PAREN } +| '[' { L_BRACKET } +| ']' { R_BRACKET } +| ':' { COLUMN } +| '.' { DOT } +| ',' { COMA } +| '^' { CONCAT_OPERATOR } +| '+' { BINARY_OPERATOR (Expression.Add) } +| '-' { BINARY_OPERATOR (Expression.Minus) } +| '/' { BINARY_OPERATOR (Expression.Division) } +| "and" { BOOL_OPERATOR (Expression.And) } +| "or" { BOOL_OPERATOR (Expression.Or) } +| '<' { INEQUALITY_OPERATOR (Expression.LT) } +| '>' { INEQUALITY_OPERATOR (Expression.GT) } +| "<>" { EQUALITY_OPERATOR (Expression.Different) } +| '=' { EQUALITY_OPERATOR (Expression.Equal) } +| digit+ as l { INTEGER l} +| '-' digit+ as l { INTEGER l} +| letters+ as l { IDENT l} +| eof { EOF } + +and skip_comment = parse + | [^ '\r' '\n' ] + { skip_comment lexbuf } + | eol + { token lexbuf } + +(* Read the content until we got another one quote *) +and read_quoted_string buf = parse + | [^ '\'' '\\' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_quoted_string buf lexbuf + } + | "\\\'" + { Buffer.add_char buf '\''; + read_quoted_string buf lexbuf + } + | '\\' + { Buffer.add_char buf '\\'; + read_quoted_string buf lexbuf + } + | '\'' + { LITERAL (Buffer.contents buf) + } + +(* Read the content until we got another one quote *) +and read_dquoted_string buf = parse + | [^ '"' '\\' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_dquoted_string buf lexbuf + } + | "\\\"" + { Buffer.add_char buf '"'; + read_dquoted_string buf lexbuf + } + | '\\' + { Buffer.add_char buf '\\'; + read_dquoted_string buf lexbuf + } + | '"' + { + LITERAL (Buffer.contents buf) + } diff --git a/lib/configuration/expression_parser.messages b/lib/configuration/expression_parser.messages new file mode 100644 index 0000000..ff7e757 --- /dev/null +++ b/lib/configuration/expression_parser.messages @@ -0,0 +1,123 @@ +column_expr: R_PAREN +## + +Invalid expression + +path_expr: IDENT R_PAREN +column_expr: IDENT R_PAREN +column_expr: IDENT L_PAREN IDENT R_PAREN +path_expr: IDENT L_PAREN IDENT R_PAREN +## + +Misplaced function. Did you forgot to quote the text ? + +column_expr: IDENT L_PAREN EOF +path_expr: IDENT L_PAREN EOF +## + +Uncomplete expression + +column_expr: COLUMN R_PAREN +path_expr: COLUMN R_PAREN +## + +The path is missing. + +column_expr: LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL L_PAREN +column_expr: LITERAL CONCAT_OPERATOR LITERAL BINARY_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL BINARY_OPERATOR LITERAL L_PAREN +column_expr: LITERAL CONCAT_OPERATOR LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL CONCAT_OPERATOR LITERAL L_PAREN +column_expr: IDENT L_PAREN L_PAREN LITERAL L_PAREN +path_expr: IDENT L_PAREN LITERAL L_PAREN +## + +A text is given where it was expected a function. + +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +column_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +path_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +path_expr: LITERAL BINARY_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +## +## Ends in an error in state: 61. +## +## separated_nonempty_list(COMA,expr_(path_,COMA)) -> expr_(path_,COMA) COMA . separated_nonempty_list(COMA,expr_(path_,COMA)) [ R_PAREN ] +## +## The known suffix of the stack is as follows: +## expr_(path_,COMA) COMA +## + +Uncomplete expression + +column_expr: IDENT L_PAREN LITERAL COMA R_PAREN +path_expr: IDENT L_PAREN LITERAL COMA R_PAREN + +Misplaced coma + +column_expr: IDENT L_PAREN LITERAL EOF +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN EOF +path_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN EOF +path_expr: IDENT L_PAREN L_PAREN LITERAL EOF +## + +Uncomplete expression. Did you forgot a ')' ? + + +column_expr: LITERAL R_PAREN +path_expr: LITERAL BINARY_OPERATOR LITERAL R_PAREN +## + +Invalid expression + +path_expr: COLUMN IDENT L_PAREN +## +# Also apply to : +# path_expr: COLUMN IDENT COLUMN + +Misplaced path + +path_expr: COLUMN IDENT DOT R_PAREN +## + +Incomplete path: the table is missing + +column_expr: INTEGER BINARY_OPERATOR INTEGER R_PAREN +## + +Unbalanced parens. Did you wanted to write ')' instead of '(' ? + + +path_expr: IDENT L_PAREN L_BRACKET LITERAL R_PAREN + +Unbalanced brackets. Did you wanted to write ']' instead of ')' ? + +column_expr: IDENT L_PAREN LITERAL CONCAT_OPERATOR R_PAREN +path_expr: IDENT L_PAREN LITERAL CONCAT_OPERATOR R_PAREN +column_expr: LITERAL CONCAT_OPERATOR R_PAREN +path_expr: LITERAL CONCAT_OPERATOR R_PAREN +column_expr: IDENT L_PAREN LITERAL BINARY_OPERATOR R_PAREN +path_expr: IDENT L_PAREN LITERAL BINARY_OPERATOR R_PAREN +column_expr: LITERAL BINARY_OPERATOR R_PAREN +path_expr: LITERAL BINARY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR R_PAREN +column_expr: INTEGER INEQUALITY_OPERATOR R_PAREN +path_expr: INTEGER INEQUALITY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR INTEGER INEQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR INTEGER INEQUALITY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR INTEGER EQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR INTEGER EQUALITY_OPERATOR R_PAREN + +The operator expect two arguments. Only one is given + +column_expr: IDENT L_PAREN L_BRACKET R_PAREN +path_expr: IDENT L_PAREN L_BRACKET R_PAREN +column_expr: IDENT L_PAREN L_BRACKET LITERAL R_PAREN +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET +path_expr: INTEGER CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET +path_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET + +Mix between brackets and parens. + diff --git a/lib/configuration/expression_parser.mly b/lib/configuration/expression_parser.mly new file mode 100644 index 0000000..1304c4d --- /dev/null +++ b/lib/configuration/expression_parser.mly @@ -0,0 +1,185 @@ +%token <string>IDENT +%token L_PAREN +%token R_PAREN +%token L_BRACKET R_BRACKET +%token COLUMN +%token DOT +%token <string>LITERAL +%token <string>INTEGER +%token COMA +%token EOF +%token CONCAT_OPERATOR + +%token <ImportExpression.T.binary_operator>BINARY_OPERATOR +%token <ImportExpression.T.binary_operator>INEQUALITY_OPERATOR +%token <ImportExpression.T.binary_operator>EQUALITY_OPERATOR +%token <ImportExpression.T.binary_operator>BOOL_OPERATOR + +%start <ImportDataTypes.Path.t ImportExpression.T.t> path_expr +%start <ImportDataTypes.Path.column ImportExpression.T.t> column_expr + +%right BOOL_OPERATOR +%right INEQUALITY_OPERATOR EQUALITY_OPERATOR +%right CONCAT_OPERATOR BINARY_OPERATOR + +%{ + + let function_of_name param f = + match (String.lowercase_ascii f, param) with + | "nvl", _ -> + ImportExpression.T.Nvl param + | "join", (ImportExpression.T.Literal sep:: tl) -> + ImportExpression.T.Join (sep, tl) + | "join", (ImportExpression.T.Empty:: tl) -> + ImportExpression.T.Join ("", tl) + | "upper", _ -> + ImportExpression.T.Function' (ImportExpression.T.Upper, param) + | "trim", _ -> + ImportExpression.T.Function' (ImportExpression.T.Trim, param) + | other, _ -> + ImportExpression.T.Function (other, param) + +%} + +%% + +path_expr: + | expr_(path_, EOF) EOF { $1 } + | EOF { ImportExpression.T.Empty } +column_expr: + | expr_(column_, EOF) EOF { $1 } + | EOF { ImportExpression.T.Empty } + + +path_: + | COLUMN + column = IDENT + { ImportExpression.T.Path + Syntax.Path.{ alias = None + ; column = ImportCSV.Csv.column_of_string column + } + } + + | COLUMN + table = IDENT + DOT + column = IDENT + { ImportExpression.T.Path + Syntax.Path.{ alias = Some table + ; column = ImportCSV.Csv.column_of_string column} + } + +column_: + | COLUMN + column = IDENT + { try ImportExpression.T.Path (ImportCSV.Csv.column_of_string column) + with _ -> ImportExpression.T.Literal column } + +arguments(PATH): + | L_PAREN + expr = separated_list(COMA, expr_(PATH, COMA)) + R_PAREN + { expr } + +group(PATH): + | L_BRACKET + expr = separated_list(COMA, expr_(PATH, COMA)) + R_BRACKET + { expr } + +fixed(PATH): + | d = INTEGER + { ImportExpression.T.Integer d } + | l = LITERAL + { + if String.equal String.empty l then + ImportExpression.T.Empty + else + ImportExpression.T.Literal l + } + +%inline boperator: + | e = BINARY_OPERATOR { e } + | e = INEQUALITY_OPERATOR { e } + | e = EQUALITY_OPERATOR { e } + | e = BOOL_OPERATOR { e } + +(* The expression evaluation receveive in parameters : + 1. the way to buidl a path, as we have two distinct ways to build them in + the case of externals (the external_key does not allow a table name) + 2. a phantom type telling wich kind of element will end the expression. + This can be EOF for the root expression, or COMA when inside a function. + This prevent merlin to optimize thoses two path, and allow more precise + error messages. *) +expr_(PATH, ENDING_PHANTOM): + | L_PAREN + e = expr_(PATH, R_PAREN) + R_PAREN + { ImportExpression.T.Expr e + } + | + p1 = expr_(PATH, ENDING_PHANTOM) + CONCAT_OPERATOR + p2 = expr_(PATH, COMA) + { match p2 with + | ImportExpression.T.Concat args -> ImportExpression.T.Concat (p1::args) + | _ -> ImportExpression.T.Concat (p1::p2::[]) + } + | p1 = expr_(PATH, ENDING_PHANTOM) + + op = boperator + p2 = expr_(PATH, COMA) + { ImportExpression.T.BOperator (op, p1, p2) } + + | p1 = expr_(PATH, ENDING_PHANTOM) + op = EQUALITY_OPERATOR + p2 = group(PATH) + { ImportExpression.T.GEquality(op, p1, p2) } + + + + | p = PATH + { p } + | f = fixed(PATH) + { f } + | s = IDENT + args = arguments(PATH) + { function_of_name args s } + | + s = IDENT + L_PAREN + opt_arg = opt_arg(PATH, COMA)? + args1 = group(PATH) + COMA + args2 = group(PATH) + R_PAREN + { let window_name = ImportExpression.T.window_of_name s opt_arg in + ImportExpression.T.Window (window_name, args1, args2) } +(* + | (* This case is here to describe a window function which has 2 arguments + level. + I’m not completely satisfied with it, as it prevent the ability to + create a exprpression block with parens arround. *) + s = IDENT + L_PAREN + opt_arg = opt_arg(PATH, COMA)? + args1 = arguments(PATH) + COMA + args2 = arguments(PATH) + R_PAREN + { let window_name = ImportExpression.T.window_of_name s opt_arg in + let expr = ImportExpression.T.Window (window_name, args1, args2) in + + let expr_repr = ImportExpression.Repr.repr ~top:true (fun _ -> "") + expr in + Printf.printf "Deprecated syntax in \"%s\" use [] instead of ()\n" expr_repr; + + + expr + } +*) + +opt_arg(PATH, SEP): + | expr = expr_(PATH, COMA) + SEP + { expr } diff --git a/lib/configuration/importConf.ml b/lib/configuration/importConf.ml new file mode 100644 index 0000000..586be3c --- /dev/null +++ b/lib/configuration/importConf.ml @@ -0,0 +1,90 @@ +open StdLabels +module Syntax = Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module T = Read_conf +module Expression = ImportExpression.T + +let current_syntax = 1 + +let t_of_yojson : Yojson.Safe.t -> Syntax.t = + fun json -> + let keys = Yojson.Safe.Util.keys json in + let version = + match List.find_opt keys ~f:(String.equal "version") with + | None -> + Printf.printf + "No version given. Your setup may break in the future.\n\ + Please add « \"version\":%d » in your configuration.\n\n" + current_syntax; + `Int 1 + | Some _ -> Yojson.Safe.Util.member "version" json + in + + match version with + | `Int 1 -> Of_json.t_of_yojson json + | other -> + Printf.eprintf "Unsuported version : %s\n" (Yojson.Safe.to_string other); + exit 1 + +module TomlReader = Read_conf.Make (Helpers.Toml.Decode) + +let t_of_toml : Otoml.t -> (Syntax.t, string) result = + fun toml -> + let version = + Otoml.find toml (Otoml.get_integer ~strict:false) [ "version" ] + in + match version with + | 1 -> TomlReader.read toml + | _ -> + Printf.eprintf "Unsuported version : %d\n" version; + exit 1 + +let dummy_conf = + Syntax. + { + source = { file = ""; tab = 0; name = "" }; + version = 1; + externals = []; + columns = []; + filters = []; + sort = []; + uniq = []; + } + +let get_table_for_name : Syntax.t -> string option -> Table.t = + fun conf name -> + match name with + | None -> conf.source + | Some name -> + if String.equal name conf.source.name then conf.source + else + let ext = + List.find conf.externals ~f:(fun (ext : Syntax.extern) -> + String.equal name ext.target.name) + in + ext.target + +let root_table : Syntax.t -> Table.t = fun conf -> conf.source + +let get_dependancies_for_table : Syntax.t -> Table.t -> Syntax.extern list = + fun conf source -> + let is_root = source = conf.source in + + List.filter conf.externals ~f:(fun (ext : Syntax.extern) -> + (* Enumerate the intern_key and check the source pointed by each column *) + Expression.fold_values ext.intern_key ~init:false ~f:(fun acc expr -> + if acc then acc + else + match expr.Syntax.Path.alias with + | Some v -> String.equal v source.name + | None -> is_root)) + +let print_path_expression t = ImportExpression.Repr.repr Path.repr t + +let print_extern t = + let toml = Syntax.toml_of_extern t in + Otoml.Printer.to_string toml + +let expression_from_string s = + Read_conf.ExpressionParser.of_string Read_conf.ExpressionParser.path s diff --git a/lib/configuration/importConf.mli b/lib/configuration/importConf.mli new file mode 100644 index 0000000..3a8ae75 --- /dev/null +++ b/lib/configuration/importConf.mli @@ -0,0 +1,23 @@ +module Syntax = Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +val dummy_conf : Syntax.t + +val root_table : Syntax.t -> Table.t +(** Get the root table, this table is the main table to load and each line in + this table will be processed *) + +val t_of_yojson : Yojson.Safe.t -> Syntax.t +val t_of_toml : Otoml.t -> (Syntax.t, string) result +val get_table_for_name : Syntax.t -> string option -> Table.t + +val get_dependancies_for_table : Syntax.t -> Table.t -> Syntax.extern list +(** Get all the externals refered by the source *) + +val print_path_expression : Path.t ImportExpression.T.t -> string + +val expression_from_string : + string -> (Path.t ImportExpression.T.t, string) result + +val print_extern : Syntax.extern -> string diff --git a/lib/configuration/of_json.ml b/lib/configuration/of_json.ml new file mode 100644 index 0000000..f9171b9 --- /dev/null +++ b/lib/configuration/of_json.ml @@ -0,0 +1,134 @@ +open StdLabels +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module Expression = ImportExpression.T + +open Ppx_yojson_conv_lib.Yojson_conv.Primitives + +let current_syntax = 1 + +let rec expression_of_yojson : + (Yojson.Safe.t -> 'a) -> Yojson.Safe.t -> 'a Expression.t = + fun f expr -> + match expr with + | `Null -> Empty + | `List l -> Concat (List.map ~f:(expression_of_yojson f) l) + | `String s as json -> ( + try Path (f json) with + | _ -> Literal s) + | `Assoc [ (fn, `List [ `List l1; `List l2 ]) ] + when String.equal "counter" (String.lowercase_ascii fn) -> + Window + ( Expression.Counter, + List.map ~f:(expression_of_yojson f) l1, + List.map ~f:(expression_of_yojson f) l2 ) + | `Assoc [ (fn, `List [ expr1; `List l2; `List l3 ]) ] + when String.equal "previous" (String.lowercase_ascii fn) -> + Window + ( Expression.Previous (expression_of_yojson f expr1), + List.map ~f:(expression_of_yojson f) l2, + List.map ~f:(expression_of_yojson f) l3 ) + | `Assoc [ (fn, `List l) ] when String.equal "nvl" (String.lowercase_ascii fn) + -> Nvl (List.map ~f:(expression_of_yojson f) l) + | `Assoc [ (fn, `List l) ] -> + Function + (String.lowercase_ascii fn, List.map ~f:(expression_of_yojson f) l) + | json -> ( + try Path (f json) with + | _ -> + let str_json = Yojson.Safe.pretty_to_string json in + raise + (ImportErrors.JsonError { json = str_json; element = "Expression" }) + ) + +type 'a expression = 'a Expression.t +type column = Path.column + +let column_of_yojson : Yojson.Safe.t -> int = function + | `Int i -> i + | `String s -> ImportCSV.Csv.column_of_string s + | _ -> raise (Invalid_argument "column") + +let yojson_of_column i = `String (ImportCSV.Csv.column_to_string i) + +type path = Syntax.Path.t = { + alias : string option; [@default None] [@yojson_drop_default ( = )] + (* External file to load, when the information is missing, load in + the current file *) + column : column; +} +[@@deriving of_yojson] + +let path_of_yojson : Yojson.Safe.t -> path = function + | `String s -> + Scanf.sscanf s ":%s@.%s" (fun table column -> + if String.equal column "" then + { alias = None; column = ImportCSV.Csv.column_of_string table } + else + { + alias = Some table; + column = ImportCSV.Csv.column_of_string column; + }) + | other -> path_of_yojson other + +let yojson_of_path : path -> Yojson.Safe.t = + fun { alias; column } -> + let table = + match alias with + | None -> "" + | Some table -> String.cat table "." + in + + `String + (String.concat ~sep:"" + [ ":"; table; ImportCSV.Csv.column_to_string column ]) + +type table = Table.t = { + file : string; + tab : int; [@default 1] [@yojson_drop_default ( = )] + name : string; +} +[@@deriving of_yojson] + +type extern = { + source : string option; [@default None] [@yojson_drop_default ( = )] + intern_key : column expression; + target : table; + extern_key : column expression; + allow_missing : bool; [@default false] [@yojson_drop_default ( = )] + match_rule : string option; [@default None] [@yojson_drop_default ( = )] +} +[@@deriving of_yojson] + +type syntax_v1_extern = Syntax.extern + +let syntax_v1_extern_of_yojson yojson = + let e = extern_of_yojson yojson in + let intern_key : path Expression.t = + Expression.map e.intern_key ~f:(fun column -> + Syntax.Path.{ column; alias = e.source }) + in + Syntax. + { + extern_key = e.extern_key; + intern_key; + target = e.target; + allow_missing = e.allow_missing; + match_rule = e.match_rule; + } + +type predicate = unit + +let predicate_of_yojson _ = () +let yojson_of_predicate () = `Null + +type t = Syntax.t = { + version : int; [@default current_syntax] + source : table; + externals : syntax_v1_extern list; [@default []] + columns : path expression list; + filters : path expression list; [@default []] [@yojson_drop_default ( = )] + sort : path expression list; [@default []] [@yojson_drop_default ( = )] + uniq : path expression list; [@default []] [@yojson_drop_default ( = )] +} +[@@deriving of_yojson] diff --git a/lib/configuration/read_conf.ml b/lib/configuration/read_conf.ml new file mode 100644 index 0000000..8d467a5 --- /dev/null +++ b/lib/configuration/read_conf.ml @@ -0,0 +1,216 @@ +open StdLabels +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +module ExpressionParser : sig + type 'a path_builder + + val path : Path.t path_builder + val column : Path.column path_builder + + val of_string : + 'a path_builder -> string -> ('a ImportExpression.T.t, string) result +end = struct + module MI = Expression_parser.MenhirInterpreter + module E = MenhirLib.ErrorReports + module L = MenhirLib.LexerUtil + + type error = { + message : string; + start_line : int; + start_pos : int; + end_pos : int; + } + + let range_message start_pos end_pos message = + let start_c = start_pos.Lexing.pos_cnum - start_pos.Lexing.pos_bol + and end_c = end_pos.Lexing.pos_cnum - end_pos.Lexing.pos_bol in + { + message; + start_line = start_pos.Lexing.pos_bol; + start_pos = start_c; + end_pos = end_c; + } + + (** Extract the line in error from the whole expression, and print some + characters just under the faulty part *) + let get_line_error : error -> string -> string = + fun error content -> + let sub_text = + try + let end_pos = String.index_from content error.start_line '\n' in + + String.sub content ~pos:error.start_line + ~len:(end_pos - error.start_line) + with + | Not_found -> + (* There is no new line, extract the ending part *) + let len = String.length content - error.start_line in + String.sub content ~pos:error.start_line ~len + in + (* I’m not sure how to produce it, but the error may be over two lines. + This line is here to prevent the underline to overflow. *) + let stop_pos = min error.end_pos (String.length sub_text) in + let error_length = stop_pos - error.start_pos in + String.concat ~sep:"" + [ + sub_text; + "\n"; + String.make error.start_pos ' '; + String.make error_length '^'; + ] + + let get_parse_error default_position env : error = + match MI.stack env with + | (lazy Nil) -> + range_message default_position.Lexing.lex_start_p + default_position.Lexing.lex_curr_p "Invalid syntax\n" + | (lazy (Cons (MI.Element (state, _, start_pos, end_pos), _))) -> + let message = + try Expression_parser_messages.message (MI.number state) with + | Not_found -> "Invalid syntax (no specific message for this eror)\n" + in + + range_message start_pos end_pos message + + type 'a path_builder = + Lexing.position -> 'a ImportExpression.T.t MI.checkpoint + + let rec _parse lexbuf (checkpoint : 'a ImportExpression.T.t MI.checkpoint) = + match checkpoint with + | MI.InputNeeded _env -> + let token = Expression_lexer.token lexbuf in + let startp = lexbuf.lex_start_p and endp = lexbuf.lex_curr_p in + let checkpoint = MI.offer checkpoint (token, startp, endp) in + _parse lexbuf checkpoint + | MI.Shifting _ | MI.AboutToReduce _ -> + let checkpoint = MI.resume checkpoint in + _parse lexbuf checkpoint + | MI.HandlingError _env -> + let err = get_parse_error lexbuf _env in + Error err + | MI.Accepted v -> Ok v + | MI.Rejected -> + let err = + range_message lexbuf.lex_start_p lexbuf.lex_curr_p + "invalid syntax (parser rejected the input)" + in + Error err + + let of_string : + 'a path_builder -> string -> ('a ImportExpression.T.t, string) result = + fun f str_expression -> + try + let lexbuf = Lexing.from_string str_expression in + let init = f lexbuf.lex_curr_p in + match _parse lexbuf init with + | Ok res -> Ok res + | Error e -> + let message = + String.concat ~sep:"\n" + [ e.message; get_line_error e str_expression ] + in + Error message + with + | Expression_lexer.UnclosedQuote { line; content } -> + let message = + Printf.sprintf "Unclosed quote at line %d : \"%s\"" line content + in + Error message + | e -> + let message = Printexc.to_string e in + Error message + + let path = Expression_parser.Incremental.path_expr + let column = Expression_parser.Incremental.column_expr +end + +module Make (S : Decoders.Decode.S) = struct + let ( let* ) = S.( let* ) + let ( and* ) = S.( and* ) + let ( >>= ) = S.( >>= ) + let ( >|= ) = S.( >|= ) + + class loader = + object (self) + method parse_expression : type a. + a ExpressionParser.path_builder -> + S.value -> + (a ImportExpression.T.t, S.value Decoders.Error.t) result = + fun path -> + S.string >>= fun v -> + match ExpressionParser.of_string path v with + | Ok expr -> S.succeed expr + | Error e -> S.fail_with Decoders.Error.(make e) + + method source = + let* file = S.field "file" S.string + and* name = S.field "name" S.string + and* tab = S.field_opt_or ~default:1 "tab" S.int in + S.succeed { Table.file; name; tab } + + method external_ name = + let* intern_key = + S.field "intern_key" (self#parse_expression ExpressionParser.path) + and* extern_key = + S.field "extern_key" (self#parse_expression ExpressionParser.column) + and* file = S.field "file" S.string + and* tab = S.field_opt_or ~default:1 "tab" S.int + and* allow_missing = + S.field_opt_or ~default:false "allow_missing" S.bool + in + + S.succeed + Syntax. + { + intern_key; + extern_key; + target = { name; file; tab }; + allow_missing; + match_rule = None; + } + + method sheet = + let* columns = + S.field "columns" + @@ S.list (self#parse_expression ExpressionParser.path) + and* filters = + S.field_opt_or ~default:[] "filters" + @@ S.list (self#parse_expression ExpressionParser.path) + and* sort = + S.field_opt_or ~default:[] "sort" + @@ S.list (self#parse_expression ExpressionParser.path) + and* uniq = + S.field_opt_or ~default:[] "uniq" + @@ S.list (self#parse_expression ExpressionParser.path) + in + S.succeed @@ fun version source externals -> + Syntax.{ version; source; externals; columns; filters; sort; uniq } + + method conf = + let* source = S.field "source" self#source + and* externals = + S.field_opt_or ~default:[] "externals" + (S.key_value_pairs_seq self#external_) + in + let* sheet = + S.field "sheet" self#sheet >|= fun v -> v 1 source externals + in + + S.succeed sheet + end + + let read_file file = + S.decode_file (new loader)#conf file + |> Result.map_error (fun v -> + let formatter = Format.str_formatter in + Format.fprintf formatter "%a@." S.pp_error v; + Format.flush_str_formatter ()) + + let read toml = + S.decode_value (new loader)#conf toml + |> Result.map_error (fun v -> + let formatter = Format.str_formatter in + Format.fprintf formatter "%a@." S.pp_error v; + Format.flush_str_formatter ()) +end diff --git a/lib/configuration/syntax.ml b/lib/configuration/syntax.ml new file mode 100644 index 0000000..8efdc59 --- /dev/null +++ b/lib/configuration/syntax.ml @@ -0,0 +1,88 @@ +open StdLabels +module E = ImportExpression.T +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +let toml_of_table Table.{ file; tab; name } = + let values = [ ("file", Otoml.string file); ("name", Otoml.string name) ] in + let values = + match tab with + | 1 -> values + | tab -> ("tab", Otoml.integer tab) :: values + in + + Otoml.table values + +type extern = { + intern_key : Path.t E.t; + target : Table.t; + extern_key : Path.column E.t; + allow_missing : bool; + match_rule : string option; +} + +let toml_of_extern extern = + let values = + [ + ( "intern_key", + Otoml.string + @@ ImportExpression.Repr.repr ~top:true Path.repr extern.intern_key ); + ( "extern_key", + Otoml.string + @@ ImportExpression.Repr.repr ~top:true + (fun v -> ":" ^ ImportCSV.Csv.column_to_string v) + extern.extern_key ); + ("file", Otoml.string extern.target.file); + ("allow_missing", Otoml.boolean extern.allow_missing); + ] + in + + let values = + match extern.target.tab with + | 1 -> values + | tab -> ("tab", Otoml.integer tab) :: values + in + + Otoml.table values + +let toml_of_externs externs = + List.map externs ~f:(fun e -> (e.target.name, toml_of_extern e)) + |> Otoml.table + +type t = { + version : int; + source : Table.t; + externals : extern list; + columns : Path.t E.t list; + filters : Path.t E.t list; + sort : Path.t E.t list; + uniq : Path.t E.t list; +} + +let repr t = + let repr_expression_list l = + Otoml.array + (List.map l ~f:(fun v -> + Otoml.string (ImportExpression.Repr.repr ~top:true Path.repr v))) + in + + let sheet = + Otoml.table + [ + ("columns", repr_expression_list t.columns); + ("filters", repr_expression_list t.filters); + ("sort", repr_expression_list t.sort); + ("uniq", repr_expression_list t.uniq); + ] + in + + let values = + [ + ("version", Otoml.integer t.version); + ("source", toml_of_table t.source); + ("externals", toml_of_externs t.externals); + ("sheet", sheet); + ] + in + + Otoml.table values diff --git a/lib/containers/dune b/lib/containers/dune new file mode 100755 index 0000000..46d0e24 --- /dev/null +++ b/lib/containers/dune @@ -0,0 +1,7 @@ +(library
+ (name importContainers)
+ (libraries
+ importDataTypes
+ importConf
+ )
+)
diff --git a/lib/containers/importContainers.ml b/lib/containers/importContainers.ml new file mode 100644 index 0000000..bf65ba4 --- /dev/null +++ b/lib/containers/importContainers.ml @@ -0,0 +1,61 @@ +module Conf = ImportConf +module Syntax = Conf.Syntax +module Table = ImportDataTypes.Table + +(** This key is used to create the table of each externals in the + configuration. + + This table allow to check if there are cycles between the references *) +module KeyName : sig + type t + + val compare : t -> t -> int + val from_table : Table.t -> t + val hash : t -> int + val equal : t -> t -> bool +end = struct + type t = Table.t + + let compare v1 v2 = String.compare (Table.name v1) (Table.name v2) + (* We use the alias given in the configuration as key, because we want to + be sure there is no cycle in the dependencies. It’s OK to have the same + file used in differents sources, but the sources cannot induce cycles *) + + let from_table (s : Table.t) = s + let hash = Hashtbl.hash + let equal v1 v2 = String.equal (Table.name v1) (Table.name v2) +end + +module Source : sig + type t + + val compare : t -> t -> int + val from_table : Table.t -> t + val hash : t -> int + val equal : t -> t -> bool + val name : t -> KeyName.t +end = struct + type t = Table.t + + let compare v1 v2 = String.compare v1.Table.name v2.Table.name + (* We use the alias given in the configuration as key, because we want to + be sure there is no cycle in the dependencies. It’s OK to have the same + file used in differents sources, but the sources cannot induce cycles *) + + let from_table (s : Table.t) = s + let hash = Hashtbl.hash + let equal v1 v2 = String.equal v1.Table.name v2.Table.name + let name t = KeyName.from_table t +end + +module Externals = MoreLabels.Map.Make (KeyName) +module IntSet = MoreLabels.Set.Make (Int) + +let show_intSet set = + let b = Buffer.create 16 in + IntSet.iter + ~f:(fun v -> + Buffer.add_string b (string_of_int v); + Buffer.add_char b ',') + set; + Buffer.contents b diff --git a/lib/csv/csv.ml b/lib/csv/csv.ml new file mode 100644 index 0000000..db7329d --- /dev/null +++ b/lib/csv/csv.ml @@ -0,0 +1,30 @@ +open StdLabels + +type t = int + +let column_of_char = function + | 'A' .. 'Z' as c -> Char.code c - (Char.code 'A' - 1) + | 'a' .. 'z' as c -> Char.code c - (Char.code 'a' - 1) + | c -> raise (Invalid_argument ("column: " ^ Char.escaped c)) + +let column_of_string : string -> int = + fun s -> + String.fold_left s ~init:0 ~f:(fun value c -> (value * 26) + column_of_char c) + +(** Accumulate the remaining for the successives divisions in a list. *) +let rec _to_char ~b i = + if i > 0 then + let res = i mod 26 in + let res = if res = 0 then 26 else res in + + let c = char_of_int @@ (res + 64) in + (* The modulo is accumulated in the list head, which is the expected + sequence *) + let b = c :: b in + + _to_char ~b @@ ((i - res) / 26) + else b + +let column_to_string i = + let res = _to_char ~b:[] i in + List.to_seq res |> String.of_seq diff --git a/lib/csv/dataType.ml b/lib/csv/dataType.ml new file mode 100644 index 0000000..c582b9c --- /dev/null +++ b/lib/csv/dataType.ml @@ -0,0 +1,21 @@ +let match_date = Re.Str.regexp {|[0-9]+/[0-9]+/[0-9]+|} + +type t = + | Null + | Error of string + | Content of string + | Integer of int + | Float of float + +let to_string = function + | Null -> "" + | Error s -> s + | Integer i -> string_of_int i + | Float f -> string_of_float f + | Content c -> ( + match String.starts_with ~prefix:"0" c with + | false -> c + | true -> + (* If the string is a date, do not escape it *) + if Re.Str.string_match match_date c 0 then c + else String.concat "" [ "=\""; c; "\"" ]) diff --git a/lib/csv/dataType.mli b/lib/csv/dataType.mli new file mode 100644 index 0000000..ebb8bc7 --- /dev/null +++ b/lib/csv/dataType.mli @@ -0,0 +1,8 @@ +type t = + | Null + | Error of string + | Content of string + | Integer of int + | Float of float + +val to_string : t -> string diff --git a/lib/csv/dune b/lib/csv/dune new file mode 100755 index 0000000..b0f4a72 --- /dev/null +++ b/lib/csv/dune @@ -0,0 +1,6 @@ +(library
+ (name importCSV)
+ (libraries
+ re
+ )
+)
diff --git a/lib/data_types/dune b/lib/data_types/dune new file mode 100644 index 0000000..e38310b --- /dev/null +++ b/lib/data_types/dune @@ -0,0 +1,10 @@ +(library + (name importDataTypes) + (libraries + importCSV + ) + + (preprocess (pps ppx_deriving.ord)) + ) + + diff --git a/lib/data_types/path.ml b/lib/data_types/path.ml new file mode 100644 index 0000000..6684b5a --- /dev/null +++ b/lib/data_types/path.ml @@ -0,0 +1,15 @@ +type column = int [@@deriving ord] + +type t = { + alias : string option; + (* External file to load, when the information is missing, load in + the current file *) + column : column; +} +[@@deriving ord] + +let repr { alias; column } = + let column_text = ImportCSV.Csv.column_to_string column in + match alias with + | None -> ":" ^ column_text + | Some value -> ":" ^ value ^ "." ^ column_text diff --git a/lib/data_types/readme.rst b/lib/data_types/readme.rst new file mode 100644 index 0000000..ac609d2 --- /dev/null +++ b/lib/data_types/readme.rst @@ -0,0 +1,4 @@ +This module contains all the types used in the application. + +It does not depends on any other library, and does not cause any dependency +cycle. diff --git a/lib/data_types/table.ml b/lib/data_types/table.ml new file mode 100644 index 0000000..d807c5c --- /dev/null +++ b/lib/data_types/table.ml @@ -0,0 +1,19 @@ +open StdLabels + +type t = { + file : string; + tab : int; + name : string; +} + +(** Get the internal name for the given table. + + This value may differ from the association name given in the configuration. *) +let name : t -> string = + fun source -> + let file_name = + source.file |> Filename.basename |> Filename.remove_extension + in + match source.tab with + | 1 -> file_name + | _ -> String.concat ~sep:"_" [ file_name; string_of_int source.tab ] diff --git a/lib/data_types/types.ml b/lib/data_types/types.ml new file mode 100644 index 0000000..37fd90f --- /dev/null +++ b/lib/data_types/types.ml @@ -0,0 +1,15 @@ +type t = + | Number + | String + | Bool + | None + | Extern + | Float + +let string_of_t : t -> string = function + | Number -> "Number" + | String -> "String" + | Bool -> "Bool" + | None -> "None" + | Extern -> "Extern" + | Float -> "Float" diff --git a/lib/errors/dune b/lib/errors/dune new file mode 100644 index 0000000..ab71219 --- /dev/null +++ b/lib/errors/dune @@ -0,0 +1,9 @@ +(library + (name importErrors) + (libraries + csv + sqlite3 + importCSV + importDataTypes + ) +) diff --git a/lib/errors/importErrors.ml b/lib/errors/importErrors.ml new file mode 100644 index 0000000..04f9deb --- /dev/null +++ b/lib/errors/importErrors.ml @@ -0,0 +1,98 @@ +open StdLabels +module CSV = ImportCSV +module Table = ImportDataTypes.Table + +let bom = "\xEF\xBB\xBF" + +type xlsError = { + source : Table.t; + row : int; + sheet : int; + target : Table.t option; + value : CSV.DataType.t; + exn : exn; +} + +exception + JsonError of { + json : string; + element : string; + } + +exception InvalidEncoding of string +exception NullKey of int +exception SqlError of Sqlite3.Rc.t +exception MisplacedWindow +exception Unknown_source of string +exception Unknown_extension of string + +exception Cycle of string list +(** Cycle between the dpendencies *) + +exception + TypeError of { + expected : ImportDataTypes.Types.t; + actual : ImportDataTypes.Types.t; + expression : string; + subset : string; + } + +let repr_error = function + | SqlError s -> Printf.sprintf "%s Error" (Sqlite3.Rc.to_string s) + | JsonError { json; element } -> Printf.sprintf "%s : %s" element json + | NullKey k -> + Printf.sprintf "The key %s is null" (ImportCSV.Csv.column_to_string k) + | Unknown_source source -> + Printf.sprintf "The source %s is referenced without beiing declared" + source + | MisplacedWindow -> "A group function cannot appear here" + | TypeError { expected; actual; expression; subset } -> + Printf.sprintf + "In this expression %s has type %s but %s was expected:\n%s" subset + (ImportDataTypes.Types.string_of_t actual) + (ImportDataTypes.Types.string_of_t expected) + expression + | Unknown_extension ext -> Printf.sprintf "Unknown file extension %s" ext + | Cycle deps -> + Printf.sprintf "Cycle between the dependencies : %s" + (String.concat ~sep:"," deps) + | Failure e -> e + | e -> + Printexc.print_backtrace Stdlib.stdout; + Printexc.to_string e + +type t = Csv.out_channel Lazy.t + +let output_error : t -> xlsError -> unit = + fun csv error -> + let target = + match error.target with + | None -> "" + | Some value -> value.name + in + + Csv.output_record (Lazy.force csv) + [ + Table.name error.source; + string_of_int error.sheet; + string_of_int error.row; + target; + CSV.DataType.to_string error.value; + repr_error error.exn; + ] + +let log : with_bom:bool -> string -> string -> t = + fun ~with_bom prefix dirname -> + lazy + (let file = Filename.concat dirname (String.cat prefix "_errors.csv") in + + prerr_endline @@ String.cat "Errors found. See the file " file; + + (* Open the error file *) + let channel = Out_channel.open_bin file in + if with_bom then output_string channel bom; + let log_error = Csv.to_channel ~separator:';' ~excel_tricks:true channel in + (* Generate a header for the error file *) + Csv.output_record log_error + [ "Source"; "Tab"; "Row"; "Target"; "Value"; "Error" ]; + log_error) diff --git a/lib/errors/importErrors.mli b/lib/errors/importErrors.mli new file mode 100644 index 0000000..7d17a5d --- /dev/null +++ b/lib/errors/importErrors.mli @@ -0,0 +1,46 @@ +module CSV = ImportCSV + +exception SqlError of Sqlite3.Rc.t +exception MisplacedWindow + +exception Unknown_source of string +(** Exception raised when a source used in a path is not declared as an + external *) + +exception Cycle of string list +(** Cycle between the dpendencies *) + +exception Unknown_extension of string + +exception + JsonError of { + json : string; + element : string; + } + +type xlsError = { + source : ImportDataTypes.Table.t; + row : int; + sheet : int; + target : ImportDataTypes.Table.t option; + value : CSV.DataType.t; + exn : exn; +} + +exception + TypeError of { + expected : ImportDataTypes.Types.t; + actual : ImportDataTypes.Types.t; + expression : string; + subset : string; + } + +exception InvalidEncoding of string +exception NullKey of int + +val repr_error : exn -> string + +type t = Csv.out_channel Lazy.t + +val output_error : t -> xlsError -> unit +val log : with_bom:bool -> string -> string -> t diff --git a/lib/expression/ast.ml b/lib/expression/ast.ml new file mode 100644 index 0000000..ef083e9 --- /dev/null +++ b/lib/expression/ast.ml @@ -0,0 +1,31 @@ +(** This module rebuilds an AST from an evaluation *) + +type 'a repr = 'a T.t +type 'a obs = 'a T.t +type 'a path_repr = unit + +let observe : 'a repr -> 'a obs = Fun.id +let empty : unit -> 'a repr = fun () -> T.Empty +let expr : 'a repr -> 'a repr = fun t -> T.Expr t +let literal : string -> 'a repr = fun s -> T.Literal s +let integer : string -> 'a repr = fun i -> T.Integer i +let path : 'a path_repr -> 'a -> 'a repr = fun _repr p -> T.Path p +let concat : 'a repr list -> 'a repr = fun ll -> T.Concat ll + +let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun w groups order -> T.Window (w, groups, order) + +let nvl : 'a repr list -> 'a repr = fun ll -> T.Nvl ll +let join : string -> 'a repr list -> 'a repr = fun s ll -> T.Join (s, ll) + +let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op e1 e2 -> T.BOperator (op, e1, e2) + +let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op e1 ll -> T.GEquality (op, e1, ll) + +let funct : string -> 'a repr list -> 'a repr = + fun name args -> T.Function (name, args) + +let function' : T.funct -> 'a repr list -> 'a repr = + fun f args -> T.Function' (f, args) diff --git a/lib/expression/compose.ml b/lib/expression/compose.ml new file mode 100644 index 0000000..028602b --- /dev/null +++ b/lib/expression/compose.ml @@ -0,0 +1,150 @@ +open StdLabels + +(** Build an expression module with the result from another expression. The + signature of the fuctions is a bit different, as they all receive the + result from the previous evaluated element in argument. *) +module Expression + (E : Sym.SYM_EXPR) + (R : sig + val v : 'a E.path_repr + end) = +struct + module type SIG = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : 'a E.obs -> 'a repr + val expr : 'a E.obs * 'a repr -> 'a E.obs -> 'a repr + val literal : string -> 'a E.obs -> 'a repr + val integer : string -> 'a E.obs -> 'a repr + val path : 'a path_repr -> 'a -> 'a E.obs -> 'a repr + val concat : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val window : + ('a E.obs * 'a repr) T.window -> + ('a E.obs * 'a repr) list -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val nvl : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val join : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val boperator : + T.binary_operator -> + 'a E.obs * 'a repr -> + 'a E.obs * 'a repr -> + 'a E.obs -> + 'a repr + + val gequality : + T.binary_operator -> + 'a E.obs * 'a repr -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val funct : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val function' : T.funct -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val observe : 'a E.obs * 'a repr -> 'a obs + end + + module Make (M : SIG) = struct + type 'a repr = 'a E.repr * 'a M.repr + type 'a obs = 'a M.obs + type 'a path_repr = 'a M.path_repr + + let map' : 'a repr list -> 'a E.repr list * ('a E.obs * 'a M.repr) list = + fun ll -> + let e = List.map ~f:fst ll in + (e, List.map ll ~f:(fun (e, m) -> (E.observe e, m))) + + let observe : 'a repr -> 'a obs = fun (t, v) -> M.observe (E.observe t, v) + + let empty : unit -> 'a repr = + fun () -> + let e = E.empty () in + (e, M.empty (E.observe e)) + + let expr : 'a repr -> 'a repr = + fun (e, m) -> + let e' = E.expr e in + (e', M.expr (E.observe e, m) (E.observe e')) + + let literal : string -> 'a repr = + fun litt -> + let e = E.literal litt in + (e, M.literal litt (E.observe e)) + + let integer : string -> 'a repr = + fun i -> + let e' = E.integer i in + (e', M.integer i (E.observe e')) + + let path : 'b path_repr -> 'b -> 'a repr = + fun path_repr path -> + let e = E.path R.v path in + let m = M.path path_repr path (E.observe e) in + (e, m) + + let concat : 'a repr list -> 'a repr = + fun reprs -> + let e, m = map' reprs in + let e' = E.concat e in + (e', M.concat m (E.observe e')) + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun window expressions order -> + let e_expressions, m_expressions = map' expressions + and e_order, m_order = map' order + and e_window = T.map_window window ~f:fst + and m_window = T.map_window window ~f:(fun (e, m) -> (E.observe e, m)) in + + let e = E.window e_window e_expressions e_order in + (e, M.window m_window m_expressions m_order (E.observe e)) + + let nvl : 'a repr list -> 'a repr = + fun reprs -> + let e, m = List.split reprs in + + let e' = E.nvl e in + let e = List.map ~f:E.observe e in + (e', M.nvl (List.combine e m) (E.observe e')) + + let join : string -> 'a repr list -> 'a repr = + fun sep reprs -> + let e_reprs, m = map' reprs in + + let e = E.join sep e_reprs in + (e, M.join sep m (E.observe e)) + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op (e1, m1) (e2, m2) -> + let e1' = E.observe e1 + and e2' = E.observe e2 + and e = E.boperator op e1 e2 in + let m' = M.boperator op (e1', m1) (e2', m2) (E.observe e) in + (e, m') + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op (e1, m1) exprs -> + let e_reprs, m_reprs = map' exprs in + let e' = E.gequality op e1 e_reprs in + let m' = M.gequality op (E.observe e1, m1) m_reprs (E.observe e') in + (e', m') + + let funct : string -> 'a repr list -> 'a repr = + fun sep reprs -> + let e_reprs, m = map' reprs in + + let e = E.funct sep e_reprs in + (e, M.funct sep m (E.observe e)) + + let function' : T.funct -> 'a repr list -> 'a repr = + fun f reprs -> + let e_reprs, m = map' reprs in + let e = E.function' f e_reprs in + (e, M.function' f m (E.observe e)) + end +end diff --git a/lib/expression/compose.mli b/lib/expression/compose.mli new file mode 100644 index 0000000..4cced8c --- /dev/null +++ b/lib/expression/compose.mli @@ -0,0 +1,59 @@ +(** Build an expression module with the result from another expression. The + signature of the fuctions is a bit different, as they all receive the + result from the previous evaluated element in argument. *) +module Expression + (E : Sym.SYM_EXPR) + (_ : sig + val v : 'a E.path_repr + end) : sig + (** The signature for the module the adapt is a bit different for the + SYM_EXPR: every function takes an extra argument which is the Expression + we are wrapping, and every expression becomes a tuple with the same + expression represented in the composed type. *) + module type SIG = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : 'a E.obs -> 'a repr + val expr : 'a E.obs * 'a repr -> 'a E.obs -> 'a repr + val literal : string -> 'a E.obs -> 'a repr + val integer : string -> 'a E.obs -> 'a repr + val path : 'a path_repr -> 'a -> 'a E.obs -> 'a repr + val concat : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val window : + ('a E.obs * 'a repr) T.window -> + ('a E.obs * 'a repr) list -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val nvl : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val join : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val boperator : + T.binary_operator -> + 'a E.obs * 'a repr -> + 'a E.obs * 'a repr -> + 'a E.obs -> + 'a repr + + val gequality : + T.binary_operator -> + 'a E.obs * 'a repr -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val funct : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val function' : T.funct -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val observe : 'a E.obs * 'a repr -> 'a obs + end + + module Make (M : SIG) : + Sym.SYM_EXPR + with type 'a obs = 'a M.obs + and type 'a repr = 'a E.repr * 'a M.repr + and type 'a path_repr = 'a M.path_repr +end diff --git a/lib/expression/dune b/lib/expression/dune new file mode 100755 index 0000000..96e386e --- /dev/null +++ b/lib/expression/dune @@ -0,0 +1,9 @@ +(library
+ (name importExpression)
+ (libraries
+ re
+ importCSV
+ importDataTypes
+ importErrors
+ )
+)
diff --git a/lib/expression/filters.ml b/lib/expression/filters.ml new file mode 100644 index 0000000..42c794b --- /dev/null +++ b/lib/expression/filters.ml @@ -0,0 +1,193 @@ +(** This module evaluate the sql query to use in order to filter an expression + + The result is built over [Query] except for the group function, which are + translated into a CTE in sql + *) + +open StdLabels +module Q = Query + +type 'a result = { + repr : Format.formatter -> nested:Query.QueryParameter.t -> unit; + group : 'a T.t option; +} + +module Filter = struct + type 'a repr = { + repr : 'a Q.Query.repr; + with_group : 'a T.t option; + } + + type 'a obs = 'a result + type 'a path_repr = 'a Q.Query.path_repr + + let observe : 'a Ast.obs * 'a repr -> 'a obs = + fun (_, v) -> { repr = Q.Query.observe v.repr; group = v.with_group } + + let empty : 'a Ast.obs -> 'a repr = + fun _ -> { repr = Q.Query.empty (); with_group = None } + + let expr : 'a Ast.obs * 'a repr -> 'a Ast.obs -> 'a repr = + fun (_, expr) _ -> + { repr = Q.Query.expr expr.repr; with_group = expr.with_group } + + let path : 'a path_repr -> 'a -> 'a Ast.obs -> 'a repr = + fun repr p _ -> { repr = Q.Query.path repr p; with_group = None } + + let literal : string -> 'a Ast.obs -> 'a repr = + fun l _ -> { repr = Q.Query.literal l; with_group = None } + + let integer : string -> 'a Ast.obs -> 'a repr = + fun l _ -> { repr = Q.Query.integer l; with_group = None } + + let nvl : ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.nvl expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let concat : ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.concat expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let join : string -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun sep expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression + and with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.join sep expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let boperator : + T.binary_operator -> + 'a Ast.obs * 'a repr -> + 'a Ast.obs * 'a repr -> + 'a Ast.obs -> + 'a repr = + fun name (_, e1) (_, e2) _ -> + let with_group = + match (e1.with_group, e2.with_group) with + | Some e, None -> Some e + | None, Some e -> Some e + | None, None -> None + | _ -> raise ImportErrors.MisplacedWindow + in + { repr = Q.Query.boperator name e1.repr e2.repr; with_group } + + let gequality : + T.binary_operator -> + 'a Ast.obs * 'a repr -> + ('a Ast.obs * 'a repr) list -> + 'a Ast.obs -> + 'a repr = + fun name (_, e1) group _ -> + let group_repr = List.map ~f:(fun v -> (snd v).repr) group + and with_group = List.find_map ~f:(fun v -> (snd v).with_group) group in + + match with_group with + | None -> + { + repr = Q.Query.gequality name e1.repr group_repr; + with_group = e1.with_group; + } + | _ -> raise ImportErrors.MisplacedWindow + + let funct : string -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun name expressions _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expressions in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + in + match with_group with + | None -> { repr = Q.Query.funct name expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let function' : + T.funct -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun name expressions _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expressions in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + in + match with_group with + | None -> + { repr = Q.Query.funct (T.name_of_function name) expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + (** Window functions are not handled in the filters, we save them as an AST + in order to process them in a separated handler. + + It is not allowed to build nested window functions. *) + let window : + ('a Ast.obs * 'a repr) T.window -> + ('a Ast.obs * 'a repr) list -> + ('a Ast.obs * 'a repr) list -> + 'a Ast.obs -> + 'a repr = + fun name expressions order ast -> + ignore name; + let with_group_expr = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + and with_group_order = + List.find_map ~f:(fun v -> (snd v).with_group) order + in + match (with_group_expr, with_group_order) with + | Some _, _ | _, Some _ -> raise ImportErrors.MisplacedWindow + | None, None -> + (* The column name used with the cte. The name is fixed here, and used + as is in [Analysers.Query.build_cte] and + [Analysers.Query.eval_filters] *) + let q = "cte.group0" in + { + with_group = Some ast; + repr = Q.Query.funct "expr" [ Q.Query.literal q ]; + } +end + +module ASTBuilder = + Compose.Expression + (Ast) + (struct + let v = () + end) + +module F : + Sym.SYM_EXPR + with type 'a obs = 'a result + and type 'a path_repr = Format.formatter -> 'a -> unit = + ASTBuilder.Make (Filter) + +module M = Sym.M (F) + +let query_of_expression : + type b. + b Q.binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + b * 'a T.t option = + fun parameter formatter printer expr -> + let repr = M.eval ~path_repr:printer expr in + match parameter with + | BindParam -> + let p = Queue.create () in + let parameter = Q.QueryParameter.Queue p in + let value = F.observe repr in + value.repr ~nested:parameter formatter; + (p, value.group) + | NoParam -> + let value = F.observe repr in + value.repr ~nested:Literal formatter; + ((), value.group) diff --git a/lib/expression/filters.mli b/lib/expression/filters.mli new file mode 100644 index 0000000..d462b5f --- /dev/null +++ b/lib/expression/filters.mli @@ -0,0 +1,9 @@ +module F : Sym.SYM_EXPR with type 'a path_repr = Format.formatter -> 'a -> unit +(** Query used inside the filter clauses *) + +val query_of_expression : + 'b Query.binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + 'b * 'a T.t option diff --git a/lib/expression/headers.ml b/lib/expression/headers.ml new file mode 100644 index 0000000..6371e4f --- /dev/null +++ b/lib/expression/headers.ml @@ -0,0 +1,89 @@ +open StdLabels + +let truncate buffer n = Buffer.truncate buffer (Buffer.length buffer - n) + +module E : + Sym.SYM_CHUNK + with type 'a obs = buffer:Buffer.t -> unit + and type 'a path_repr = 'a -> Buffer.t -> unit = struct + type 'a repr = buffer:Buffer.t -> unit + type 'a obs = buffer:Buffer.t -> unit + type 'a path_repr = 'a -> Buffer.t -> unit + + let group : 'a repr list -> 'a repr = + fun args ~buffer -> + Buffer.add_string buffer "["; + List.iter args ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer ", "); + + truncate buffer 2; + Buffer.add_string buffer "]" + + let arguments : 'a repr list -> 'a repr = + fun expressions ~buffer -> + Buffer.add_string buffer "("; + List.iter expressions ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer ", "); + + truncate buffer 2; + Buffer.add_string buffer ")" + + let observe x ~buffer = x ~buffer + let empty : unit -> 'a repr = fun _ ~buffer -> Buffer.add_string buffer "''" + let path printer p ~buffer = printer p buffer + let literal l ~buffer = Buffer.add_string buffer l + let integer l ~buffer = Buffer.add_string buffer l + + let expr expr ~buffer = + Buffer.add_char buffer '('; + expr ~buffer; + Buffer.add_char buffer ')' + + let nvl expression ~buffer = + Buffer.add_string buffer "nvl"; + arguments ~buffer expression + + let concat expression ~buffer = List.iter expression ~f:(fun v -> v ~buffer) + + let join sep expression ~buffer = + List.iter expression ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer sep); + truncate buffer (String.length sep) + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun name expressions order ~buffer -> + ignore order; + let name = T.name_of_window name in + + Buffer.add_string buffer name; + arguments ~buffer expressions + + let boperator name e1 e2 ~buffer = + e1 ~buffer; + Buffer.add_string buffer (T.name_of_operator name); + e2 ~buffer + + let gequality name e1 e2 ~buffer = + e1 ~buffer; + Buffer.add_string buffer (T.name_of_operator name); + group ~buffer e2 + + let funct name expressions ~buffer = + Buffer.add_string buffer name; + arguments ~buffer expressions + + let function' name expressions ~buffer = + Buffer.add_string buffer (T.name_of_function name); + arguments ~buffer expressions +end + +module M = Sym.M (E) + +let headers_of_expression : + Buffer.t -> ('a -> Buffer.t -> unit) -> 'a T.t -> unit = + fun buffer printer expr -> + let repr = M.eval expr ~path_repr:printer in + E.observe repr ~buffer diff --git a/lib/expression/headers.mli b/lib/expression/headers.mli new file mode 100644 index 0000000..1fafad0 --- /dev/null +++ b/lib/expression/headers.mli @@ -0,0 +1,7 @@ +val headers_of_expression : + Buffer.t -> ('a -> Buffer.t -> unit) -> 'a T.t -> unit + +module E : + Sym.SYM_EXPR + with type 'a obs = buffer:Buffer.t -> unit + and type 'a path_repr = 'a -> Buffer.t -> unit diff --git a/lib/expression/lazier.ml b/lib/expression/lazier.ml new file mode 100644 index 0000000..d8b12d9 --- /dev/null +++ b/lib/expression/lazier.ml @@ -0,0 +1,71 @@ +open StdLabels + +(** Make a module lazy *) +module Make (S : Sym.SYM_EXPR) = struct + type 'a repr = 'a S.repr Lazy.t + type 'a obs = 'a S.obs Lazy.t + type 'a path_repr = 'a S.path_repr + + let empty : unit -> 'a repr = fun () -> lazy (S.empty ()) + + let expr : 'a repr -> 'a repr = + fun expr -> Lazy.map (fun expr -> S.expr expr) expr + + let literal : string -> 'a repr = fun l -> lazy (S.literal l) + let integer : string -> 'a repr = fun i -> lazy (S.integer i) + + let path : 'b path_repr -> 'b -> 'a repr = + fun repr path -> lazy (S.path repr path) + + let concat : 'a repr list -> 'a repr = + fun exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.concat exprs') + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun w group sort -> + lazy + (let w' = T.map_window ~f:Lazy.force w + and group' = List.map ~f:Lazy.force group + and sort' = List.map ~f:Lazy.force sort in + S.window w' group' sort') + + let nvl : 'a repr list -> 'a repr = + fun exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.nvl exprs') + + let join : string -> 'a repr list -> 'a repr = + fun sep exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.join sep exprs') + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op e1 e2 -> + lazy + (let e1' = Lazy.force e1 and e2' = Lazy.force e2 in + S.boperator op e1' e2') + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op e exprs -> + lazy + (let e' = Lazy.force e and exprs' = List.map ~f:Lazy.force exprs in + S.gequality op e' exprs') + + let funct : string -> 'a repr list -> 'a repr = + fun name exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.funct name exprs') + + let function' : T.funct -> 'a repr list -> 'a repr = + fun f exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.function' f exprs') + + let observe : 'a repr -> 'a obs = fun v -> Lazy.map S.observe v +end diff --git a/lib/expression/query.ml b/lib/expression/query.ml new file mode 100644 index 0000000..5bd914a --- /dev/null +++ b/lib/expression/query.ml @@ -0,0 +1,335 @@ +(** + This module create an sql query from an expression. + *) + +open StdLabels + +(** This type is used in the query builder (see [query_of_expression] just + below in order to tell if we need to bind the parameters in the query, or + if we can use plain literal as is (with some risk at the execution time. *) +type _ binded_query = + | BindParam : ImportCSV.DataType.t Queue.t binded_query + | NoParam : unit binded_query + +module QueryParameter = struct + (** Internaly, we need to keep a different type for the Literal chunks + (which requires to be quoted), and raw (which should be given as is to the + sql engine) + + The Raw can be generated from both BindParam or NoParam queries. *) + type t = + | Literal + | Queue of ImportCSV.DataType.t Queue.t + | Raw of t + + (** Wrap the given parameter mode into the raw mode *) + let raw : t -> t = function + | Raw t -> Raw t + | Literal -> Raw Literal + | Queue q -> Raw (Queue q) + + (** Nest the parameter in order to use it inside another function call. + + The rule is to get out of the Raw mode as soon as we dive into another + one function. *) + let nest : t -> t = function + | Raw t -> t + | other -> other +end + +module TypeBuilder = + Compose.Expression + (Type_of) + (struct + let v = ignore + end) + +module Query = TypeBuilder.Make (struct + type 'a repr = Format.formatter -> nested:QueryParameter.t -> unit + type 'a obs = Format.formatter -> nested:QueryParameter.t -> unit + type 'a path_repr = Format.formatter -> 'a -> unit + + let observe : 'a Type_of.obs * 'a repr -> 'a obs = + fun (_, x) formatter ~nested -> + let () = x formatter ~nested in + Format.pp_print_flush formatter () + + (** Unify an external reference with a given type, using the COALESCE + function *) + let unify : + with_:Type_of.t -> + nested:QueryParameter.t -> + Format.formatter -> + 'a Type_of.obs * 'a repr -> + unit = + fun ~with_ ~nested format (type_of, expr) -> + match (type_of, with_) with + | ImportDataTypes.Types.Extern, Number + | ImportDataTypes.Types.Extern, Extern -> + Format.fprintf format "COALESCE(%a,0)" + (fun f expr -> expr f ~nested) + expr + | ImportDataTypes.Types.Extern, String -> + Format.fprintf format "COALESCE(%a,'')" + (fun f expr -> expr f ~nested) + expr + | _, Float -> + Format.fprintf format "CAST(%a AS REAL)" + (fun f expr -> expr f ~nested) + expr + | _, _ -> expr ~nested format + + let empty : 'a Type_of.obs -> 'a repr = + fun type_of formatter ~nested -> + ignore type_of; + ignore nested; + Format.fprintf formatter "''" + + let expr : 'a Type_of.obs * 'a repr -> 'a Type_of.obs -> 'a repr = + fun expr type_of formatter ~nested -> + ignore type_of; + Format.fprintf formatter "("; + (snd expr) ~nested formatter; + Format.fprintf formatter ")" + + let literal : string -> 'a Type_of.obs -> 'a repr = + fun l type_of formatter ~nested -> + ignore type_of; + match nested with + | QueryParameter.Literal -> + (* If the text is a true literal, we insert it directly. This is + only called from the [query_of_expression] function *) + Format.fprintf formatter "'%s'" l + | QueryParameter.Queue queue -> + Format.fprintf formatter "?"; + Queue.add (ImportCSV.DataType.Content l) queue + | QueryParameter.Raw _ -> Format.fprintf formatter "%s" l + + let integer : string -> 'a Type_of.obs -> 'a repr = + fun l type_of formatter ~nested -> + ignore type_of; + ignore nested; + Format.fprintf formatter "%s" l + + let path : 'b path_repr -> 'b -> 'a Type_of.obs -> 'a repr = + fun repr p type_of formatter ~nested -> + ignore nested; + ignore type_of; + repr formatter p + + let concat : ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f " || ") + (unify ~with_:ImportDataTypes.Types.String ~nested:nested') + formatter expression + + let print_expression : + ?sep:string -> + QueryParameter.t -> + Format.formatter -> + ('a Type_of.obs * 'a repr) list -> + unit = + fun ?(sep = ", ") nested formatter expression -> + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f "%s" sep) + (fun f v -> (snd v) f ~nested)) + formatter expression + + (** Format the partition expression. This function is used internally and + only form the expression inside the clause. *) + let group_windows : + QueryParameter.t -> + Format.formatter -> + ('a Type_of.obs * 'a repr) list + * ('a Type_of.obs * 'a repr) list + * string option -> + unit = + fun nested formatter (expressions, order, range) -> + match (expressions, order) with + | [], _ -> () + | _, [] -> + Format.fprintf formatter " OVER (PARTITION BY %a%a)" + (print_expression nested) expressions + (Format.pp_print_option (fun f v -> Format.fprintf f "%s" v)) + range + | _, _ -> + Format.fprintf formatter " OVER (PARTITION BY %a ORDER BY %a%a)" + (print_expression nested) expressions (print_expression nested) order + (Format.pp_print_option (fun f v -> Format.fprintf f "%s" v)) + range + + let window : + ('a Type_of.obs * 'a repr) T.window -> + ('a Type_of.obs * 'a repr) list -> + ('a Type_of.obs * 'a repr) list -> + 'a Type_of.obs -> + 'a repr = + fun name expressions order type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + (* By default, the range is defined like this + + [RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW EXCLUDE NO OTHERS] + + this only build a range until the current row, but in some cases (min, + last), we want to scan the whole group in order to evaluate the value to + keep. + *) + let range = " RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in + match name with + | T.Min expr -> + Format.fprintf formatter "FIRST_VALUE(%a)%a " + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') + (expressions, order, Some range) + | T.Max expr -> + Format.fprintf formatter "LAST_VALUE(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') + (expressions, order, Some range) + | T.Counter -> + (* If no order is given, return the number of elements in the + whole group *) + let operator = + match order with + | [] -> "COUNT" + | _ -> "ROW_NUMBER" + in + Format.fprintf formatter "%s()%a" operator (group_windows nested') + (expressions, order, None) + | T.Previous expr -> + Format.fprintf formatter "LAG(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') (expressions, order, None) + | T.Sum expr -> + Format.fprintf formatter "SUM(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') (expressions, order, None) + + let nvl : ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + Format.fprintf formatter "COALESCE(%a)" (print_expression nested') + expression + + let join : + string -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun sep expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + (* Directly call the literal function for the first argument *) + Format.fprintf formatter "CONCAT(%a, %a)" + (fun f v -> (literal v ImportDataTypes.Types.String) f ~nested:nested') + sep (print_expression nested') expression + + let boperator : + T.binary_operator -> + 'a Type_of.obs * 'a repr -> + 'a Type_of.obs * 'a repr -> + 'a Type_of.obs -> + 'a repr = + fun name e1 e2 type_of formatter ~nested -> + ignore type_of; + (* When dividing, we need to be sure that the type is a float, + otherwise SQL will truncate the result *) + let with_ = + match name with + | T.Division -> ImportDataTypes.Types.Float + | _ -> fst e2 + in + + let nested' = QueryParameter.nest nested in + Format.fprintf formatter "%a%s%a" + (unify ~with_ ~nested:nested') + e1 + (* The operator *) + (T.name_of_operator name) + (unify ~with_:(fst e1) ~nested:nested') + e2 + + let gequality : + T.binary_operator -> + 'a Type_of.obs * 'a repr -> + ('a Type_of.obs * 'a repr) list -> + 'a Type_of.obs -> + 'a repr = + fun name e1 group type_of -> + ignore type_of; + let group_type = List.map ~f:fst group in + fun formatter ~nested -> + let nested' = QueryParameter.nest nested in + let op_name = + match name with + | T.Equal -> " IN(" + | T.Different -> " NOT IN(" + | _ -> "" + in + + Format.fprintf formatter "%a%s%a)" + (unify ~with_:(Type_of.group' group_type) ~nested:nested') + e1 op_name (print_expression nested') group + + let exprs expressions formatter ~nested = + (* Literal expression, starting from now, all the quoted string are + directly given to the sql engine *) + let nested' = QueryParameter.raw nested in + + Format.fprintf formatter "(%a)" + (print_expression ~sep:" " nested') + expressions + + let rec funct : + string -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun name expressions type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + match name with + | "expr" -> + (* Raw expression are parsed directly *) + exprs expressions formatter ~nested + | "if" -> + (* The if is renamed into IIF *) + funct "IIF" expressions type_of formatter ~nested + | _ -> + (* Default case *) + Format.fprintf formatter "%s(%a)" name (print_expression nested') + expressions + + let function' : + T.funct -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun name expressions type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + match name with + | Upper | Trim -> + Format.fprintf formatter "%s(%a)" (T.name_of_function name) + (print_expression nested') expressions +end) + +module M = Sym.M (Query) + +let query_of_expression : + type b. + b binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + b = + fun parameter formatter printer expr -> + let repr = M.eval ~path_repr:printer expr in + match parameter with + | BindParam -> + let p = Queue.create () in + let parameter = QueryParameter.Queue p in + Query.observe repr formatter ~nested:parameter; + p + | NoParam -> + Query.observe repr formatter ~nested:Literal; + () diff --git a/lib/expression/query.mli b/lib/expression/query.mli new file mode 100644 index 0000000..fa789a9 --- /dev/null +++ b/lib/expression/query.mli @@ -0,0 +1,27 @@ +module QueryParameter : sig + (** Internaly, we need to keep a different type for the Literal chunks + (which requires to be quoted), and raw (which should be given as is to the + sql engine) + + The Raw can be generated from both BindParam or NoParam queries. *) + type t = + | Literal + | Queue of ImportCSV.DataType.t Queue.t + | Raw of t +end + +type _ binded_query = + | BindParam : ImportCSV.DataType.t Queue.t binded_query + | NoParam : unit binded_query + +val query_of_expression : + 'b binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + 'b + +module Query : + Sym.SYM_EXPR + with type 'a obs = Format.formatter -> nested:QueryParameter.t -> unit + and type 'a path_repr = Format.formatter -> 'a -> unit diff --git a/lib/expression/repr.ml b/lib/expression/repr.ml new file mode 100644 index 0000000..4990236 --- /dev/null +++ b/lib/expression/repr.ml @@ -0,0 +1,127 @@ +open StdLabels + +let escape_dquote = Re.Str.regexp "'" +let escape content = Re.Str.global_replace escape_dquote "\\'" content + +module E : + Sym.SYM_CHUNK + with type 'a obs = top:bool -> string + and type 'a path_repr = 'a -> string = struct + type 'a repr = top:bool -> string + type 'a obs = top:bool -> string + type 'a path_repr = 'a -> string + + let observe x = x + + let group : 'a repr list -> 'a repr = + fun args ~top -> + let args_repr = List.map ~f:(fun v -> v ~top) args in + let args = String.concat ~sep:", " args_repr in + "[" ^ args ^ "]" + + let arguments : 'a repr list -> 'a repr = + fun args ~top -> + let args_repr = List.map ~f:(fun v -> v ~top) args in + let args = String.concat ~sep:", " args_repr in + "(" ^ args ^ ")" + + let empty : unit -> 'a repr = + fun () ~top -> + match top with + | false -> "''" + | true -> "" + + let literal : string -> 'a repr = + fun l ~top -> + if String.equal String.empty l then (empty ()) ~top + else + match int_of_string_opt l with + | Some _ -> l + | None -> "'" ^ escape l ^ "'" + + let integer : string -> 'a repr = + fun l ~top -> if String.equal String.empty l then (empty ()) ~top else l + + let expr : 'a repr -> 'a repr = + fun expr ~top -> + ignore top; + String.concat ~sep:"" [ "("; expr ~top:false; ")" ] + + let path : 'b path_repr -> 'b -> 'a repr = + fun path_repr p ~top -> + ignore top; + path_repr p + + let concat : 'a repr list -> 'a repr = + fun elems ~top -> + ignore top; + let top = false in + let strs = List.map elems ~f:(fun v -> v ~top) in + String.concat ~sep:" ^ " strs + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun name g1 sort ~top -> + ignore top; + + let args1 = group ~top:false g1 + and args2 = group ~top:false sort + and f_name = T.name_of_window name in + let args = [ args1; args2 ] in + let args = + match name with + | T.Counter -> args + | T.Min prefix_arg + | T.Max prefix_arg + | T.Previous prefix_arg + | T.Sum prefix_arg -> prefix_arg ~top:false :: args + in + + f_name ^ "(" ^ String.concat ~sep:", " args ^ ")" + + let nvl : 'a repr list -> 'a repr = + fun elems ~top -> + ignore top; + let args = arguments ~top:false elems in + "nvl" ^ args + + let join : string -> 'a repr list -> 'a repr = + fun sep elems ~top -> + ignore top; + let header = literal sep in + let args = arguments ~top:false (header :: elems) in + "join" ^ args + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op arg1 arg2 ~top -> + ignore top; + let top = false in + let sep = T.name_of_operator op in + String.concat ~sep [ arg1 ~top; arg2 ~top ] + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op arg1 arg2 ~top -> + ignore top; + let top = false in + let sep = T.name_of_operator op in + let args = group ~top:false arg2 in + String.concat ~sep [ arg1 ~top; args ] + + let funct : string -> 'a repr list -> 'a repr = + fun f args ~top -> + ignore top; + let args = arguments ~top:false args in + f ^ args + + let function' : T.funct -> 'a repr list -> 'a repr = + fun f args ~top -> + ignore top; + let args = arguments ~top:false args in + T.name_of_function f ^ args +end + +module M = Sym.M (E) + +let repr : ?top:bool -> ('a -> string) -> 'a T.t -> string = + fun ?(top = false) printer expr -> + let repr = M.eval ~path_repr:printer expr in + E.observe repr ~top diff --git a/lib/expression/repr.mli b/lib/expression/repr.mli new file mode 100644 index 0000000..4431655 --- /dev/null +++ b/lib/expression/repr.mli @@ -0,0 +1,6 @@ +val repr : ?top:bool -> ('a -> string) -> 'a T.t -> string + +module E : + Sym.SYM_EXPR + with type 'a obs = top:bool -> string + and type 'a path_repr = 'a -> string diff --git a/lib/expression/sym.ml b/lib/expression/sym.ml new file mode 100644 index 0000000..0360e8e --- /dev/null +++ b/lib/expression/sym.ml @@ -0,0 +1,71 @@ +(** The signature for an expression analyzer. + + Every element is mapped to a function, using the tagless final pattern. + + *) +module type SYM_EXPR = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : unit -> 'a repr + val expr : 'a repr -> 'a repr + val literal : string -> 'a repr + val integer : string -> 'a repr + val path : 'a path_repr -> 'a -> 'a repr + val concat : 'a repr list -> 'a repr + val window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr + val nvl : 'a repr list -> 'a repr + val join : string -> 'a repr list -> 'a repr + val boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr + val gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr + val funct : string -> 'a repr list -> 'a repr + val function' : T.funct -> 'a repr list -> 'a repr + val observe : 'a repr -> 'a obs +end + +module type SYM_CHUNK = sig + include SYM_EXPR + + val group : 'a repr list -> 'a repr + val arguments : 'a repr list -> 'a repr +end + +open StdLabels + +module M (Expr : SYM_EXPR) = struct + let rec eval : path_repr:'a Expr.path_repr -> 'a T.t -> 'c Expr.repr = + fun ~path_repr t -> + match t with + | T.Expr expr -> Expr.expr (eval ~path_repr expr) + | T.Empty -> Expr.empty () + | T.Literal s -> Expr.literal s + | T.Integer i -> Expr.integer i + | T.Concat elems -> Expr.concat (List.map elems ~f:(eval ~path_repr)) + | T.Function (name, args) -> + Expr.funct name (List.map args ~f:(eval ~path_repr)) + | T.Function' (name, args) -> + Expr.function' name (List.map args ~f:(eval ~path_repr)) + | T.Nvl elems -> Expr.nvl (List.map elems ~f:(eval ~path_repr)) + | T.Join (sep, args) -> Expr.join sep (List.map args ~f:(eval ~path_repr)) + | T.Window (name, group, sort) -> + Expr.window + (eval_window ~path_repr name) + (List.map group ~f:(eval ~path_repr)) + (List.map sort ~f:(eval ~path_repr)) + | T.BOperator (op, arg1, arg2) -> + Expr.boperator op (eval ~path_repr arg1) (eval ~path_repr arg2) + | T.GEquality (op, arg1, arg2) -> + Expr.gequality op (eval ~path_repr arg1) + (List.map arg2 ~f:(eval ~path_repr)) + | T.Path p -> Expr.path path_repr p + + and eval_window : + path_repr:'a Expr.path_repr -> 'a T.t T.window -> 'a Expr.repr T.window = + fun ~path_repr -> function + | Min a -> Min (eval ~path_repr a) + | Max a -> Max (eval ~path_repr a) + | Counter -> Counter + | Previous a -> Previous (eval ~path_repr a) + | Sum a -> Sum (eval ~path_repr a) +end diff --git a/lib/expression/t.ml b/lib/expression/t.ml new file mode 100644 index 0000000..7e61317 --- /dev/null +++ b/lib/expression/t.ml @@ -0,0 +1,153 @@ +open StdLabels + +type 'a window = + | Min of 'a + | Max of 'a + | Counter + | Previous of 'a + | Sum of 'a + +type 'a t = + | Empty + | Expr of 'a t + | Literal of string + | Integer of string + | Path of 'a + | Concat of 'a t list + | Function of string * 'a t list + | Nvl of 'a t list + | Join of string * 'a t list + | Window of ('a t window * 'a t list * 'a t list) + | BOperator of binary_operator * 'a t * 'a t + | GEquality of binary_operator * 'a t * 'a t list + | Function' of funct * 'a t list + +and binary_operator = + | Equal + | Different + | Add + | Minus + | Division + | LT + | GT + | And + | Or + +and funct = + | Upper + | Trim + +let name_of_function = function + | Upper -> "UPPER" + | Trim -> "TRIM" + +let name_of_operator = function + | Equal -> "=" + | Different -> "<>" + | Add -> "+" + | Minus -> "-" + | Division -> "/" + | LT -> "<" + | GT -> ">" + | And -> " and " + | Or -> " or " + +let name_of_window = function + | Min _ -> "min" + | Max _ -> "max" + | Counter -> "counter" + | Previous _ -> "previous" + | Sum _ -> "sum" + +let map_window : f:('a -> 'b) -> 'a window -> 'b window = + fun ~f -> function + | Min t -> Min (f t) + | Max t -> Max (f t) + | Counter -> Counter + | Previous t -> Previous (f t) + | Sum t -> Sum (f t) + +(** Extract the kind of the window function from the given name. *) +let window_of_name name opt = + match (name, opt) with + | "min", Some p -> Min p + | "max", Some p -> Max p + | "counter", None -> Counter + | "previous", Some p -> Previous p + | "sum", Some p -> Sum p + | _other -> raise Not_found + +let rec cmp : ('a -> 'a -> int) -> 'a t -> 'a t -> int = + fun f e1 e2 -> + match (e1, e2) with + | Empty, Empty -> 0 + | Literal l1, Literal l2 -> String.compare l1 l2 + | Integer l1, Integer l2 -> String.compare l1 l2 + | Path p1, Path p2 -> f p1 p2 + | Concat elems1, Concat elems2 | Nvl elems1, Nvl elems2 -> + List.compare ~cmp:(cmp f) elems1 elems2 + | Function (n1, elems1), Function (n2, elems2) -> + let name_cmp = String.compare n1 n2 in + if name_cmp = 0 then List.compare ~cmp:(cmp f) elems1 elems2 else name_cmp + | Window (s1, l11, l12), Window (s2, l21, l22) -> ( + match compare s1 s2 with + | 0 -> + let l1_cmp = List.compare ~cmp:(cmp f) l11 l21 in + if l1_cmp = 0 then List.compare ~cmp:(cmp f) l12 l22 else l1_cmp + | other -> other) + | BOperator (n1, arg11, arg12), BOperator (n2, arg21, arg22) -> begin + match compare n1 n2 with + | 0 -> begin + match cmp f arg11 arg21 with + | 0 -> cmp f arg12 arg22 + | other -> other + end + | other -> other + end + (* Any other case *) + | other1, other2 -> Stdlib.compare other1 other2 + +let fold_values : f:('b -> 'a -> 'b) -> init:'b -> 'a t -> 'b = + fun ~f ~init expression -> + let rec _f acc = function + | Empty | Literal _ | Integer _ -> acc + | Expr e -> _f acc e + | Path p -> f acc p + | Concat pp | Function' (_, pp) | Function (_, pp) | Nvl pp | Join (_, pp) + -> List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc pp + | Window (window_f, pp1, pp2) -> + (* Each window function can have a distinct parameter first. *) + let acc' = + match window_f with + | Counter -> acc + | Min key | Max key | Previous key | Sum key -> _f acc key + in + let eval1 = List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc' pp1 in + List.fold_left ~f:(fun acc a -> _f acc a) ~init:eval1 pp2 + | BOperator (_, arg1, arg2) -> _f (_f acc arg1) arg2 + | GEquality (_, arg1, arg2) -> + let eval1 = List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc arg2 in + _f eval1 arg1 + in + _f init expression + +let map : type a b. f:(a -> b) -> a t -> b t = + fun ~f expression -> + let rec map = function + | Expr e -> Expr (map e) + | Empty -> Empty + | Literal s -> Literal s + | Integer i -> Integer i + | Path p -> Path (f p) + | Concat pp -> Concat (List.map ~f:map pp) + | Function' (name, pp) -> Function' (name, List.map ~f:map pp) + | Function (name, pp) -> Function (name, List.map ~f:map pp) + | Nvl pp -> Nvl (List.map ~f:map pp) + | Join (sep, pp) -> Join (sep, List.map ~f:map pp) + | Window (window_f, pp1, pp2) -> + let w = map_window ~f:map window_f in + Window (w, List.map ~f:map pp1, List.map ~f:map pp2) + | BOperator (n, arg1, arg2) -> BOperator (n, map arg1, map arg2) + | GEquality (n, arg1, args) -> GEquality (n, map arg1, List.map ~f:map args) + in + map expression diff --git a/lib/expression/t.mli b/lib/expression/t.mli new file mode 100644 index 0000000..840805d --- /dev/null +++ b/lib/expression/t.mli @@ -0,0 +1,54 @@ +type 'a window = + | Min of 'a + | Max of 'a + | Counter + | Previous of 'a + | Sum of 'a + +type 'a t = + | Empty + | Expr of 'a t + | Literal of string + | Integer of string + | Path of 'a + | Concat of 'a t list + | Function of string * 'a t list + | Nvl of 'a t list + | Join of string * 'a t list + | Window of ('a t window * 'a t list * 'a t list) + | BOperator of binary_operator * 'a t * 'a t + | GEquality of binary_operator * 'a t * 'a t list + | Function' of funct * 'a t list + +and binary_operator = + | Equal + | Different + | Add + | Minus + | Division + | LT + | GT + | And + | Or + +and funct = + | Upper + | Trim + +val cmp : ('a -> 'a -> int) -> 'a t -> 'a t -> int +(** Compare two expressions *) + +val fold_values : f:('b -> 'a -> 'b) -> init:'b -> 'a t -> 'b +(** Fold over all the path presents inside the expression. Used for example to + identify all the columns to extract from the file. + + The order is not guarantee to follow the order from the expression *) + +val map : f:('a -> 'b) -> 'a t -> 'b t +(** The map function. Mainly used in the configuration migration. *) + +val name_of_operator : binary_operator -> string +val name_of_window : 'a window -> string +val map_window : f:('a -> 'b) -> 'a window -> 'b window +val window_of_name : string -> 'a option -> 'a window +val name_of_function : funct -> string diff --git a/lib/expression/type_of.ml b/lib/expression/type_of.ml new file mode 100644 index 0000000..ce1a17e --- /dev/null +++ b/lib/expression/type_of.ml @@ -0,0 +1,150 @@ +(** + This module evaluate the type of an expression. + + The type is given with an analysis from all the component involved inside + the exrpssion. It is used inside the [query] module in order to check if one + type need conversion before being used. + *) + +open StdLabels + +module Lazy_Repr = + Compose.Expression + (Lazier.Make + (Repr.E)) + (struct + let v _ = "" + end) + +type t = ImportDataTypes.Types.t + +(** Fold over the list of parameters and ensure all the elements are typed in +the same way *) +let group' : t list -> t = + fun elements -> + List.fold_left elements ~init:None + ~f:(fun (acc : ImportDataTypes.Types.t option) v -> + match acc with + | None -> Some v + | Some t when t = v -> acc + | _ -> Some Extern) + |> Option.value ~default:ImportDataTypes.Types.None + +include Lazy_Repr.Make (struct + type nonrec t = t + type 'a repr = t + type 'a obs = ImportDataTypes.Types.t + type 'a path_repr = 'a -> unit + + let observe : 'a Repr.E.obs Lazy.t * 'a repr -> 'a obs = snd + + let empty : 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ -> ImportDataTypes.Types.None + + let expr : 'a Repr.E.obs Lazy.t * 'a repr -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun e _ -> snd e + + let literal : string -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.String + + let integer : string -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.Number + + let path : 'b path_repr -> 'b -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ _ -> ImportDataTypes.Types.Extern + + let concat : + ('a Repr.E.obs Lazy.t * 'a repr) list -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.String + + let window : + ('a Repr.E.obs Lazy.t * 'a repr) T.window -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions order _ -> + ignore order; + ignore expressions; + match name with + | T.Counter | T.Max _ | T.Min _ | T.Sum _ -> Number + | T.Previous expr -> snd expr + + let nvl : + ('a Repr.E.obs Lazy.t * 'a repr) list -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun v _ -> group' (List.map ~f:snd v) + + let join : + string -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun _ _ _ -> ImportDataTypes.Types.String + + let boperator : + T.binary_operator -> + 'a Repr.E.obs Lazy.t * 'a repr -> + 'a Repr.E.obs Lazy.t * 'a repr -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name _ _ _ -> + match name with + | T.Equal | T.Different | T.LT | T.GT -> Bool + | T.Add | T.Minus -> Number + | T.Division -> Float + | T.And | T.Or -> Bool + + let gequality : + T.binary_operator -> + 'a Repr.E.obs Lazy.t * 'a repr -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name _ _ _ -> + match name with + | T.Equal | T.Different -> Bool + | _ -> None + + let function' : + T.funct -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions _ -> + ignore expressions; + match name with + | Upper | Trim -> String + + let check : expected:t -> actual:t -> string -> 'a Repr.E.obs Lazy.t -> t = + fun ~expected ~actual subset expr -> + if actual = expected then actual + else + let expression = (Lazy.force expr) ~top:false in + raise (ImportErrors.TypeError { expression; subset; expected; actual }) + + let funct : + string -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions repr -> + match name with + | "if" -> begin + match expressions with + | [] -> Extern + | (_, hd) :: arg1 :: _ when hd = Bool -> snd arg1 + | (_, hd) :: _ -> + let expected = ImportDataTypes.Types.Bool and actual = hd in + check ~expected ~actual "the predicate" repr + end + | _ -> Extern +end) + +let group : + ('a Lazier.Make(Repr.E).repr * t) list -> 'a Lazier.Make(Repr.E).repr * t = + fun v -> + let v' = group' (List.map v ~f:snd) in + let l = lazy (Repr.E.empty ()) in + (l, v') + +let arguments = group diff --git a/lib/expression/type_of.mli b/lib/expression/type_of.mli new file mode 100644 index 0000000..7a11582 --- /dev/null +++ b/lib/expression/type_of.mli @@ -0,0 +1,10 @@ +(** This module tries to identify the type of an expression. + +The references to data comming from the spreaedsheet cannot be evaluated and +marked as [Extern]. *) + +type t = ImportDataTypes.Types.t + +include Sym.SYM_CHUNK with type 'a obs = t and type 'a path_repr = 'a -> unit + +val group' : t list -> t diff --git a/lib/file_handler/csv2sql.ml b/lib/file_handler/csv2sql.ml new file mode 100644 index 0000000..42d84eb --- /dev/null +++ b/lib/file_handler/csv2sql.ml @@ -0,0 +1,135 @@ +open StdLabels +module A = ImportAnalyser.Dependency +module CSV = ImportCSV +module C = ImportContainers +module Syntax = ImportConf.Syntax +module Db = ImportSQL.Db + +type state = CSV.DataType.t array State.t + +let default_mapper : + (ImportCSV.DataType.t, ImportCSV.DataType.t array) State.mapper = + { get_row = Fun.id; get_value = Fun.id; default = ImportCSV.DataType.Null } + +let extract_values : string -> CSV.DataType.t = + fun value -> + (* Test first if the content is empty *) + if String.equal String.empty value then CSV.DataType.Null + else + (* else, try differents conversion in order to see which one works *) + match int_of_string_opt value with + | Some i -> CSV.DataType.Integer i + | None -> ( + match float_of_string_opt value with + | Some f -> CSV.DataType.Float f + | None -> + (* And finaly convert into date *) + CSV.DataType.Content value) + +(** Initialize the state for the first row, count the column number and create + the table in the database *) +let first_row : A.t -> _ Db.t -> state -> CSV.DataType.t list -> state = + fun mapping db acc row -> + (if acc.transaction then + match Db.commit db with + | Ok () -> () + | Error e -> print_endline (ImportErrors.repr_error e)); + + ignore @@ Db.create_table db mapping; + let row = Array.of_list row in + match Db.prepare_insert db mapping with + | Ok stmt -> + { + acc with + header = Some row; + transaction = false; + insert_stmt = Some stmt; + row_number = acc.row_number + 1; + } + | _ -> { acc with header = Some row; transaction = false; insert_stmt = None } + +let read_csv_line : + log_error:ImportErrors.t -> A.t -> 'a Db.t -> state -> string list -> state + = + fun ~log_error mapping db acc row -> + let processed_row = + List.to_seq row |> Seq.map extract_values |> Array.of_seq + in + if acc.State.transaction then + State.run_row ~log_error ~mapper:default_mapper mapping db processed_row acc + else + match Db.begin_transaction db with + | Error e -> + print_endline (ImportErrors.repr_error e); + acc + | Ok () -> + let acc = { acc with transaction = true } in + State.run_row ~log_error ~mapper:default_mapper mapping db processed_row + acc + +let importInDatable : + log_error:ImportErrors.t -> + conf:Syntax.t -> + dirname:string -> + A.t -> + 'a Db.t -> + CSV.DataType.t array option Lwt.t = + fun ~log_error ~conf ~dirname mapping db -> + let file = Filename.concat dirname (A.table mapping).file in + + let channel = Stdlib.open_in_bin file in + + let csv_channel = Csv.of_channel ~separator:';' ~excel_tricks:true channel in + + (* In the headers, we only keep the string. + + This line could generate an error if the headers are not correctly defined. + *) + let header = + List.map ~f:(fun v -> CSV.DataType.Content v) (Csv.next csv_channel) + in + + let state = + State. + { + transaction = false; + header = None; + insert_stmt = None; + check_key_stmt = None; + row_number = 1; + sheet_number = 1; + delayed = []; + } + in + let state = first_row mapping db state header in + + let state = + try + Csv.fold_left csv_channel ~init:state + ~f:(read_csv_line ~log_error mapping db) + with + | Csv.Failure (line, row, cause) as e -> + Printf.eprintf "Error %s on line %d — field : %s\n" cause line + (ImportCSV.Csv.column_to_string row); + raise e + in + ignore @@ State.clear ~log_error db mapping conf; + ignore @@ Db.commit db; + + (* Finalize the statements created during the import *) + let () = + Option.iter (fun v -> ignore @@ Db.finalize v) state.insert_stmt; + Option.iter (fun v -> ignore @@ Db.finalize v) state.check_key_stmt + in + + (* Insert all the headers *) + let _ = + Option.iter + (fun headers -> + let values = Array.mapi headers ~f:(fun i value -> (i, value)) in + + ignore + @@ Db.insert_header db (ImportAnalyser.Dependency.table mapping) values) + state.header + in + Lwt.return state.header diff --git a/lib/file_handler/csv2sql.mli b/lib/file_handler/csv2sql.mli new file mode 100644 index 0000000..e09737b --- /dev/null +++ b/lib/file_handler/csv2sql.mli @@ -0,0 +1,10 @@ +val importInDatable : + log_error:ImportErrors.t -> + conf:ImportConf.Syntax.t -> + dirname:string -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + ImportCSV.DataType.t array option Lwt.t +(** Load an excel spreadsheet in an SQLite database. + +Return the header if at least one row where present *) diff --git a/lib/file_handler/dune b/lib/file_handler/dune new file mode 100755 index 0000000..6b247db --- /dev/null +++ b/lib/file_handler/dune @@ -0,0 +1,21 @@ +(library
+ (name importFileHandler)
+ (libraries
+ csv
+ SZXX
+ sqlite3
+ base
+ core
+ lwt
+ lwt.unix
+ helpers
+ importAnalyser
+ importConf
+ importContainers
+ importCSV
+ importDataTypes
+ importErrors
+ importExpression
+ importSQL
+ )
+)
diff --git a/lib/file_handler/state.ml b/lib/file_handler/state.ml new file mode 100644 index 0000000..5b43aff --- /dev/null +++ b/lib/file_handler/state.ml @@ -0,0 +1,178 @@ +open StdLabels +module Table = ImportDataTypes.Table + +type 'a t = { + header : 'a option; + transaction : bool; + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; + row_number : int; + sheet_number : int; + delayed : 'a list; +} + +type insert_result = { + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; +} + +type ('a, 'b) mapper = { + get_row : 'b -> 'a Array.t; + get_value : 'a -> ImportCSV.DataType.t; + default : 'a; +} + +module A = ImportAnalyser.Dependency + +let insert_row : + mapper:(_, 'row) mapper -> + A.t -> + _ ImportSQL.Db.t -> + 'row -> + _ t -> + (insert_result, ImportErrors.xlsError) result = + fun ~mapper mapping db row state -> + (* Extract all columns referenced in the keys or the columns to extract *) + let keys_id = + List.fold_left (A.keys mapping) ~init:ImportContainers.IntSet.empty + ~f:(fun acc (keys : A.key) -> + let columns = keys.A.columns in + ImportContainers.IntSet.union acc (Lazy.force columns)) + and columns_id = A.columns mapping in + let ids = ImportContainers.IntSet.(union keys_id columns_id |> elements) in + + (* Filter only the required columns in the row *) + let values = + List.map ids ~f:(fun i -> + let index = i - 1 in + let value = + try Array.get (mapper.get_row row) index with + | Stdlib.Invalid_argument _ -> + (* If we have more headers than data, assume the value are NULL. + This can happen when all the line tail is empty, Excel can + give us a truncated line instead of a series of NULL *) + mapper.default + in + (index, mapper.get_value value)) + in + let keys = A.keys mapping in + + let execution = + let ( let* ) = Result.bind in + let* check_key_stmt, result = + ImportSQL.Db.eval_key db state.check_key_stmt keys values + in + let no_null = + (* We check if we have at least one key which is not null — and in such + case we ignore the line. + + If multiple keys are presents, we ensure there is at least one non + null here. + *) + match result with + | [] -> true + | _ -> + List.exists result ~f:(function + | Sqlite3.Data.FLOAT _ | Sqlite3.Data.INT _ -> true + | Sqlite3.Data.BLOB t | Sqlite3.Data.TEXT t -> + not (String.equal "" t) + | Sqlite3.Data.NONE | Sqlite3.Data.NULL -> false) + in + let* _ = + match no_null with + | true -> Ok () + | false -> Error (Failure "The key is null") + in + + let* statement = + match state.insert_stmt with + | None -> ImportSQL.Db.prepare_insert db mapping + | Some v -> Ok v + in + let* _ = ImportSQL.Db.insert db statement ~id:state.row_number values in + let* _ = ImportSQL.Db.reset statement in + + Helpers.Console.update_cursor (); + Ok { insert_stmt = Some statement; check_key_stmt } + in + + (* In case of error, wrap the exception to get the line *) + Result.map_error + (fun e -> + ImportErrors. + { + source = ImportAnalyser.Dependency.table mapping; + sheet = state.sheet_number; + row = state.row_number; + target = None; + value = CSV.DataType.Content (String.concat ~sep:", " []); + exn = e; + }) + execution + +(** Load the row with all the informations associated with this sheet. + + If an error has already been raised during the sheet, ignore this row only. *) +let run_row : + log_error:ImportErrors.t -> + mapper:(_, 'row) mapper -> + A.t -> + _ ImportSQL.Db.t -> + 'row -> + 'a t -> + 'a t = + fun ~log_error ~mapper mapping db row state -> + match insert_row ~mapper mapping db row state with + | Ok { insert_stmt; check_key_stmt } -> + { + state with + insert_stmt; + check_key_stmt; + row_number = state.row_number + 1; + } + | Error e -> + Option.iter (fun v -> ignore @@ ImportSQL.Db.finalize v) state.insert_stmt; + Option.iter + (fun v -> ignore @@ ImportSQL.Db.finalize v) + state.check_key_stmt; + ImportErrors.output_error log_error e; + { + state with + insert_stmt = None; + check_key_stmt = None; + row_number = state.row_number + 1; + } + +let clear : + log_error:ImportErrors.t -> + 'a ImportSQL.Db.t -> + A.t -> + ImportConf.Syntax.t -> + unit ImportSQL.Db.result = + fun ~log_error db mapping conf -> + ImportSQL.Db.clear_duplicates db (A.table mapping) (A.keys mapping) + ~f:(fun values -> + let line = + match snd @@ Array.get values 0 with + | ImportCSV.DataType.Integer i -> i + | _ -> -1 + and value = snd @@ Array.get values 1 + and target = + match snd @@ Array.get values 2 with + | ImportCSV.DataType.Content s -> + Some (ImportConf.get_table_for_name conf (Some s)) + | _ -> None + in + let error = + ImportErrors. + { + source = A.table mapping; + sheet = (A.table mapping).tab; + row = line; + target; + value; + exn = Failure "Duplicated key"; + } + in + + ImportErrors.output_error log_error error) diff --git a/lib/file_handler/state.mli b/lib/file_handler/state.mli new file mode 100644 index 0000000..f744c33 --- /dev/null +++ b/lib/file_handler/state.mli @@ -0,0 +1,46 @@ +type 'a t = { + header : 'a option; + transaction : bool; + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; + row_number : int; + sheet_number : int; + delayed : 'a list; +} + +type insert_result = { + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; +} + +type ('a, 'b) mapper = { + get_row : 'b -> 'a Array.t; + get_value : 'a -> ImportCSV.DataType.t; + default : 'a; +} + +val insert_row : + mapper:(_, 'row) mapper -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + 'row -> + _ t -> + (insert_result, ImportErrors.xlsError) result +(** Low level row insertion *) + +val run_row : + log_error:ImportErrors.t -> + mapper:(_, 'row) mapper -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + 'row -> + 'a t -> + 'a t + +val clear : + log_error:ImportErrors.t -> + 'a ImportSQL.Db.t -> + ImportAnalyser.Dependency.t -> + ImportConf.Syntax.t -> + unit ImportSQL.Db.result +(** Clean up the table after the insertion, check for the duplicates and external references *) diff --git a/lib/file_handler/xlsx2sql.ml b/lib/file_handler/xlsx2sql.ml new file mode 100644 index 0000000..f2d8f12 --- /dev/null +++ b/lib/file_handler/xlsx2sql.ml @@ -0,0 +1,205 @@ +open StdLabels +module A = ImportAnalyser.Dependency +module C = ImportContainers +module CSV = ImportCSV +module Syntax = ImportConf.Syntax +module Db = ImportSQL.Db + +let flags = Unix.[ O_RDONLY; O_NONBLOCK ] + +let extractors = + SZXX.Xlsx. + { + string = (fun _location s -> CSV.DataType.Content s); + error = + (fun _location s -> CSV.DataType.Error (Printf.sprintf "#ERROR# %s" s)); + boolean = + (fun _location s -> + let value = String.(equal s "1") in + CSV.DataType.Content (string_of_bool value)); + number = + (fun _location s -> + let f = Float.of_string s in + if Float.is_integer f then CSV.DataType.Integer (Float.to_int f) + else CSV.DataType.Float f); + date = (fun _location s -> CSV.DataType.Content s); + null = CSV.DataType.Null; + formula = + (fun _location ~formula s -> + ignore formula; + CSV.DataType.Content s); + } + +let feed_bigstring ic = + let open Lwt.Infix in + let len = Lwt_io.buffer_size ic in + let buf = Lwt_bytes.create len in + SZXX.Zip.Bigstring + (fun () -> + Lwt_io.read_into_bigstring ic buf 0 len >|= function + | 0 -> None + | len -> Some SZXX.Zip.{ buf; pos = 0; len }) + +(* Evaluate if the row can be processed right now (does not contain + any delayed value) *) +let is_delayed row = + Array.exists row.SZXX.Xlsx.data ~f:(function + | SZXX.Xlsx.Delayed _ -> true + | _ -> false) + +let default_mapper : + (ImportCSV.DataType.t, ImportCSV.DataType.t SZXX.Xlsx.row) State.mapper = + { + get_value = + (function + | ImportCSV.DataType.Content s -> + ImportCSV.DataType.Content (SZXX.Xml.unescape s) + | any -> any); + default = ImportCSV.DataType.Null; + get_row = (fun v -> v.SZXX.Xlsx.data); + } + +type state = CSV.DataType.t SZXX.Xlsx.status SZXX.Xlsx.row State.t + +let delayed_mapper = + State. + { + get_value = + (function + | SZXX.Xlsx.Available (CSV.DataType.Content s) -> + CSV.DataType.Content (SZXX.Xml.unescape s) + | SZXX.Xlsx.Available value -> value + | _ -> CSV.DataType.Null); + default = SZXX.Xlsx.Available CSV.DataType.Null; + get_row = (fun v -> v.SZXX.Xlsx.data); + } + +(** Initialize the state for the first row, count the column number and create + the table in the database *) +let first_row : A.t -> _ Db.t -> state -> 'a SZXX.Xlsx.row -> state = + fun mapping db acc row -> + (if acc.transaction then + match Db.commit db with + | Ok () -> () + | Error e -> print_endline (ImportErrors.repr_error e)); + + ignore @@ Db.create_table db mapping; + match Db.prepare_insert db mapping with + | Ok stmt -> + { + acc with + header = Some row; + transaction = false; + insert_stmt = Some stmt; + } + | _ -> { acc with header = Some row; transaction = false; insert_stmt = None } + +let importInDatable : + log_error:Csv.out_channel Lazy.t -> + conf:Syntax.t -> + dirname:string -> + A.t -> + 'a Db.t -> + CSV.DataType.t array option Lwt.t = + fun ~log_error ~conf ~dirname mapping db -> + let file = Filename.concat dirname (A.table mapping).file in + + Lwt_io.with_file ~flags ~mode:Input file (fun ic -> + let open Lwt.Syntax in + let stream, sst_p, success = + SZXX.Xlsx.stream_rows ~only_sheet:(A.table mapping).tab + ~feed:(feed_bigstring ic) extractors + in + let* processed = + Lwt_stream.fold + (fun row acc -> + (* Create the table on the first line *) + if Int.equal 1 row.SZXX.Xlsx.row_number then + first_row mapping db acc row + else + match is_delayed row with + | true -> { acc with delayed = row :: acc.delayed } + | false -> ( + let row_number = row.SZXX.Xlsx.row_number in + if acc.transaction then + State.run_row ~log_error ~mapper:delayed_mapper mapping db + row { acc with row_number } + else + match Db.begin_transaction db with + | Error e -> + print_endline (ImportErrors.repr_error e); + acc + | Ok () -> + let acc = { acc with transaction = true; row_number } in + State.run_row ~log_error ~mapper:delayed_mapper mapping + db row acc)) + stream + { + transaction = false; + header = None; + delayed = []; + insert_stmt = None; + check_key_stmt = None; + row_number = 1; + sheet_number = (A.table mapping).tab; + } + in + (* Wait to reach the sst *) + let* sst = sst_p in + + if processed.transaction then ignore (Db.commit db); + + (* Insert the missing elements *) + ignore @@ Db.begin_transaction db; + List.iter processed.delayed ~f:(fun row -> + let fully_available_row = + SZXX.Xlsx.unwrap_status extractors sst row + in + + let row_number = row.SZXX.Xlsx.row_number in + + match + State.insert_row ~mapper:default_mapper mapping db + fully_available_row + { processed with row_number } + with + | Ok _ -> () + | Error e -> + ImportErrors.output_error log_error e; + ()); + + ignore @@ State.clear ~log_error db mapping conf; + ignore @@ Db.commit db; + + (* Finalize the statements created during the import *) + let () = + Option.iter (fun v -> ignore @@ Db.finalize v) processed.insert_stmt; + Option.iter (fun v -> ignore @@ Db.finalize v) processed.check_key_stmt + in + + let _ = + Option.iter + (fun headers -> + let res = SZXX.Xlsx.unwrap_status extractors sst headers in + + let values = Array.mapi res.data ~f:(fun i value -> (i, value)) in + + ignore + @@ Db.insert_header db + (ImportAnalyser.Dependency.table mapping) + values) + processed.header + in + + let header = + Option.map + (fun header -> + let res = SZXX.Xlsx.unwrap_status extractors sst header in + res.data) + processed.header + in + + (* Finalize the process *) + let* () = success in + + Lwt.return header) diff --git a/lib/file_handler/xlsx2sql.mli b/lib/file_handler/xlsx2sql.mli new file mode 100644 index 0000000..e09737b --- /dev/null +++ b/lib/file_handler/xlsx2sql.mli @@ -0,0 +1,10 @@ +val importInDatable : + log_error:ImportErrors.t -> + conf:ImportConf.Syntax.t -> + dirname:string -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + ImportCSV.DataType.t array option Lwt.t +(** Load an excel spreadsheet in an SQLite database. + +Return the header if at least one row where present *) diff --git a/lib/helpers/console.ml b/lib/helpers/console.ml new file mode 100644 index 0000000..838b25a --- /dev/null +++ b/lib/helpers/console.ml @@ -0,0 +1,16 @@ +let cursors = [| '|'; '/'; '-'; '\\' |] +let pos = ref 0 + +let update_cursor () = + if Unix.(isatty stdout) then ( + Printf.printf "%c[?25l%c[1D%c[0K%c" (char_of_int 27) (char_of_int 27) + (char_of_int 27) (Array.get cursors !pos); + pos := (!pos + 1) mod Array.length cursors) + +let close_cursor () = + if Unix.(isatty stdout) then + Printf.printf "%c[?25h%c[1D%c[0K\n%!" (char_of_int 27) (char_of_int 27) + (char_of_int 27) + +let clear_line () = + if Unix.(isatty stdout) then Printf.printf "%c[2K\r%!" (char_of_int 27) diff --git a/lib/helpers/console.mli b/lib/helpers/console.mli new file mode 100644 index 0000000..289d55c --- /dev/null +++ b/lib/helpers/console.mli @@ -0,0 +1,5 @@ +val update_cursor : unit -> unit +val close_cursor : unit -> unit + +val clear_line : unit -> unit +(** Clear the entire line *) diff --git a/lib/helpers/dune b/lib/helpers/dune new file mode 100755 index 0000000..8e30d2b --- /dev/null +++ b/lib/helpers/dune @@ -0,0 +1,8 @@ +(library
+ (name helpers)
+ (libraries
+ calendar
+ decoders
+ otoml
+ )
+)
diff --git a/lib/helpers/helpers.ml b/lib/helpers/helpers.ml new file mode 100755 index 0000000..9d6fcb8 --- /dev/null +++ b/lib/helpers/helpers.ml @@ -0,0 +1,45 @@ +module Toml = Toml
+module Console = Console
+
+let date_from_csv : string -> CalendarLib.Date.t option =
+ fun value ->
+ let open CalendarLib.Date in
+ try Some (Scanf.sscanf value "%d/%d/%d" (fun d m y -> make y m d)) with
+ | _ -> (
+ (* If the date is a number, try from julian day *)
+ match int_of_string_opt value with
+ | None -> None
+ | Some v -> Some (add (make 1899 12 30) (Period.day v)))
+
+let fold_opt : ('a -> 'b -> 'a option) -> 'a -> 'b -> 'a =
+ fun f acc b ->
+ match f acc b with
+ | None -> acc
+ | Some v -> v
+
+let try_opt exp =
+ try Some (exp ()) with
+ | _ -> None
+
+let repr_date formatter date =
+ Format.fprintf formatter "%02d/%02d/%d"
+ (CalendarLib.Date.day_of_month date)
+ CalendarLib.Date.(int_of_month @@ month date)
+ (CalendarLib.Date.year date)
+
+let s_repr_date date =
+ Format.sprintf "%02d/%02d/%d"
+ (CalendarLib.Date.day_of_month date)
+ CalendarLib.Date.(int_of_month @@ month date)
+ (CalendarLib.Date.year date)
+
+let repr_opt f channel = function
+ | None -> ()
+ | Some v -> f channel v
+
+let str_format f =
+ let buffer = Buffer.create 16 in
+ let formatter = Format.formatter_of_buffer buffer in
+ f formatter;
+ Format.pp_print_flush formatter ();
+ Buffer.contents buffer
diff --git a/lib/helpers/toml.ml b/lib/helpers/toml.ml new file mode 100644 index 0000000..1b7fb15 --- /dev/null +++ b/lib/helpers/toml.ml @@ -0,0 +1,31 @@ +module Decode = struct + module S = struct + type value = Otoml.t + + let pp : Format.formatter -> value -> unit = + fun format t -> Format.pp_print_string format (Otoml.Printer.to_string t) + + let of_string : string -> (value, string) result = + Otoml.Parser.from_string_result + + let of_file : string -> (value, string) result = + Otoml.Parser.from_file_result + + let get_string : value -> string option = Otoml.get_opt Otoml.get_string + let get_int : value -> int option = Otoml.get_opt Otoml.get_integer + let get_float : value -> float option = Otoml.get_opt Otoml.get_float + let get_bool : value -> bool option = Otoml.get_opt Otoml.get_boolean + let get_null : value -> unit option = fun _ -> None + + let get_list : value -> value list option = + Otoml.get_opt @@ Otoml.get_array Fun.id + + let get_key_value_pairs : value -> (value * value) list option = + Otoml.get_opt (fun key -> + Otoml.get_table key |> List.map (fun (k, v) -> (Otoml.string k, v))) + + let to_list : value list -> value = Otoml.array + end + + include Decoders.Decode.Make (S) +end diff --git a/lib/helpers/toml.mli b/lib/helpers/toml.mli new file mode 100644 index 0000000..08d30b8 --- /dev/null +++ b/lib/helpers/toml.mli @@ -0,0 +1 @@ +module Decode : Decoders.Decode.S with type value = Otoml.t diff --git a/lib/sql/date.ml b/lib/sql/date.ml new file mode 100644 index 0000000..e8933c7 --- /dev/null +++ b/lib/sql/date.ml @@ -0,0 +1,24 @@ +(** Parse a text value into a date *) + +let first_day = CalendarLib.Date.make 1899 12 30 + +let f : Sqlite3.Data.t -> Sqlite3.Data.t -> Sqlite3.Data.t = + fun str data -> + match (str, data) with + | Sqlite3.Data.TEXT format_, Sqlite3.Data.TEXT content -> ( + try + let date = CalendarLib.Printer.Date.from_fstring format_ content in + let period = + CalendarLib.Date.sub date first_day + |> CalendarLib.Date.Period.nb_days |> Int64.of_int + in + Sqlite3.Data.INT period + with + | Invalid_argument e -> + prerr_endline e; + Sqlite3.Data.NULL) + | _ -> + (* If the data is already a date, it should be preserved as is *) + data + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun2 db "date" f diff --git a/lib/sql/db.ml b/lib/sql/db.ml new file mode 100644 index 0000000..89431b1 --- /dev/null +++ b/lib/sql/db.ml @@ -0,0 +1,383 @@ +open StdLabels +module CSV = ImportCSV +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +type 'a t = 'a T.t + +let ( let* ) res cont = Result.bind (T.to_result res) cont +let ( let** ) res cont = Result.bind res cont +let begin_transaction = T.begin_transaction +let rollback = T.rollback +let commit = T.commit +let finalize = T.finalize +let reset = T.reset +let insert_header = Header.insert_header +let query_headers = Header.query_headers + +let with_db : string -> (Sqlite3.db -> unit T.result) -> unit T.result = + fun filename f -> + let db = Sqlite3.db_open filename in + + Match.register db; + Date.register db; + Year.register db; + Join.register db; + Math.register db; + Trim.register db; + + (*let* _ = Sqlite3.exec db "PRAGMA foreign_keys = ON" |> to_result in*) + Sqlite3.( let& ) db f + +let create_table : 'a t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let name = Table.name source in + + let* _ = + Sqlite3.exec db + (String.concat ~sep:"" [ "DROP TABLE IF EXISTS '"; name; "'" ]) + in + + let query = ImportAnalyser.Query.create_table table in + let* _ = Sqlite3.exec db query in + match Header.create_table db with + | Ok () -> Hashs.create_table db + | e -> e + +let update_hash : 'a t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db mapping -> + match Hashs.insert db mapping with + | Ok () -> Ok () + | Error _ -> + let _ = Hashs.create_table db in + Hashs.insert db mapping + +let check_table_schema : 'a t -> ImportAnalyser.Dependency.t -> bool T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let name = Table.name source in + let query = + String.concat ~sep:"" + [ "SELECT sql FROM sqlite_schema WHERE name = '"; name; "'" ] + in + let stmt = Sqlite3.prepare db query in + + let rc, result = + Sqlite3.fold stmt ~init:false ~f:(fun value row -> + if Array.length row <> 1 then value + else + match Sqlite3.Data.to_string (Array.get row 0) with + | Some s -> + let query = ImportAnalyser.Query.create_table table in + String.equal s query + | None -> value) + in + let* _ = rc in + + (* The schema is the same, now check the hash in case the indexes changed *) + let rc_hash = Hashs.query db source in + match rc_hash with + | Ok (Some i) -> + let hash = Hashs.evaluate table in + begin + if i == hash then Ok result + else + let _ = update_hash db table in + Ok false + end + | _ -> + let _ = update_hash db table in + Ok result + +let prepare_insert : + Sqlite3.db -> ImportAnalyser.Dependency.t -> Sqlite3.stmt T.result = + fun db mapping -> + (* Get the list of columns from the table configuration *) + let columns = + ImportAnalyser.Dependency.columns mapping + |> ImportContainers.IntSet.elements + in + let table_name = Table.name (ImportAnalyser.Dependency.table mapping) in + + let open Buffer in + let buff = create 20 and value_buff = create 10 and col_buff = create 10 in + + add_string col_buff "'id',"; + + (* Add the key name if present *) + List.iter (ImportAnalyser.Dependency.keys mapping) + ~f:(fun { ImportAnalyser.Dependency.name; _ } -> + add_string col_buff "'key_"; + + add_string col_buff name; + add_string col_buff "',"); + + add_string value_buff ":id,"; + + (* Add the key settings if presents *) + List.iter (ImportAnalyser.Dependency.keys mapping) ~f:(fun key -> + ImportAnalyser.Query.build_key_insert value_buff key; + add_string value_buff ","); + + List.iter columns ~f:(fun id -> + add_string col_buff "'col_"; + add_string col_buff (string_of_int id); + add_string col_buff "',"; + + let col_name = ":col_" ^ string_of_int id in + + add_string value_buff col_name; + add_string value_buff ","); + + truncate col_buff (length col_buff - 1); + truncate value_buff (length value_buff - 1); + add_string buff "INSERT INTO '"; + add_string buff table_name; + add_string buff "' ("; + add_buffer buff col_buff; + add_string buff " ) VALUES ("; + add_buffer buff value_buff; + add_string buff " )"; + + let query = contents buff in + + try Ok (Sqlite3.prepare db query) with + | e -> + print_endline "Error during this query :"; + print_endline query; + Error e + +let eval_key : + 'a t -> + Sqlite3.stmt option -> + ImportAnalyser.Dependency.key list -> + (int * CSV.DataType.t) list -> + (Sqlite3.stmt option * Sqlite3.Data.t list) T.result = + fun db stmt keys values -> + match keys with + | [] -> Ok (None, []) + | _ -> + let buffer = Buffer.create 16 in + Buffer.add_string buffer "SELECT "; + List.iter keys ~f:(fun key -> + ImportAnalyser.Query.build_key_insert buffer key; + Buffer.add_string buffer ","); + + Buffer.truncate buffer (Buffer.length buffer - 1); + let query = Buffer.contents buffer in + + let statement = Sqlite3.prepare_or_reset db (ref stmt) query in + + (* Extract all the column id used in the keys. + *) + let keys_id = + List.fold_left keys ~init:ImportContainers.IntSet.empty + ~f:(fun acc (keys : ImportAnalyser.Dependency.key) -> + let columns = Lazy.force keys.ImportAnalyser.Dependency.columns in + ImportContainers.IntSet.union acc columns) + in + + let** _ = + List.fold_left values ~init:(Ok 1) ~f:(fun idx (id, value) -> + let** idx = idx in + + (* Ensure the column is required in the keys *) + match ImportContainers.IntSet.mem (1 + id) keys_id with + | false -> Ok (idx + 1) + | true -> + let sql_data = T.of_datatype value in + + let col_name = ":col_" ^ string_of_int (1 + id) in + let* _ = Sqlite3.bind_name statement col_name sql_data in + Ok (idx + 1)) + in + + let result, evaluated_keys = + Sqlite3.fold statement ~init:[] ~f:(fun _ v -> Array.to_list v) + in + let* _ = result in + Ok (Some statement, evaluated_keys) + +let insert : + Sqlite3.db -> + Sqlite3.stmt -> + id:int -> + (int * CSV.DataType.t) list -> + unit T.result = + fun db statement ~id values -> + let** _ = T.savepoint db "PREVIOUS" in + let* _ = + Sqlite3.bind_name statement ":id" (Sqlite3.Data.INT (Int64.of_int id)) + in + let** _ = + List.fold_left values ~init:(Ok 1) ~f:(fun idx (id, value) -> + let** idx = idx in + let sql_data = T.of_datatype value in + + let col_name = ":col_" ^ string_of_int (1 + id) in + let* _ = Sqlite3.bind_name statement col_name sql_data in + + Ok (idx + 1)) + in + + match T.to_result (Sqlite3.step statement) with + | Ok () -> T.release db "PREVIOUS" + | Error e -> + (* I intentionnaly ignore any error here, as we are already in an + error case *) + ignore (Sqlite3.exec db "ROLLBACK TRANSACTION TO SAVEPOINT PREVIOUS"); + Error e + +(** This simple function convert a query generated by the application into a + statement executed with sqlite. + + The function expect a perfect match between the query and the parameters. *) +let execute_query : + Sqlite3.db -> ImportAnalyser.Query.query -> Sqlite3.stmt T.result = + fun db query -> + let statement = + try Sqlite3.prepare db query.q with + | e -> + print_endline "Error during this query :"; + print_endline query.q; + raise e + in + + (* Add the binded parameters *) + let values = + Seq.map (fun v -> T.of_datatype v) query.parameters |> List.of_seq + in + + let* _ = Sqlite3.bind_values statement values in + + Ok statement + +let query : + f:((Path.t ImportExpression.T.t * CSV.DataType.t) array -> unit) -> + Sqlite3.db -> + Syntax.t -> + unit T.result = + fun ~f db output -> + (* Extract the query from the configuration. *) + let** query_analysis = + match ImportAnalyser.Query.select output with + | exception e -> Error e + | other -> Ok other + in + + let query, columns = query_analysis in + let** statement = execute_query db query in + + let* _ = + Sqlite3.iter statement ~f:(fun data -> + let values = + Array.map2 data columns ~f:(fun value column -> + (column, T.to_datatype value)) + in + f values) + in + Ok () + +let create_view : Sqlite3.db -> Syntax.t -> unit T.result = + fun db output -> + ignore output; + let* drop = Sqlite3.exec db "DROP VIEW IF EXISTS 'result'" in + ignore drop; + + Ok () + +(* + let query, _ = ImportAnalyser.Query.select output in + + let query = + { query with q = Printf.sprintf "CREATE VIEW result AS %s" query.q } + in + let** statement = execute_query db query in + + Sqlite3.step statement |> T.to_result + *) + +let check_foreign : + f:((string * CSV.DataType.t) array -> unit) -> + Sqlite3.db -> + Syntax.t -> + Syntax.extern -> + unit T.result = + fun ~f db conf external_ -> + let query = ImportAnalyser.Query.check_external conf external_ in + + let** statement = execute_query db query in + Sqlite3.iter statement ~f:(fun data -> + let values = + Array.mapi data ~f:(fun i value -> + (Sqlite3.column_name statement i, T.to_datatype value)) + in + f values) + |> T.to_result + +let clear_duplicates : + f:((string * ImportCSV.DataType.t) array -> unit) -> + 'a t -> + ImportDataTypes.Table.t -> + ImportAnalyser.Dependency.key list -> + unit T.result = + fun ~f db table keys -> + let table_name = Table.name table in + + List.fold_left keys ~init:(Ok ()) + ~f:(fun state { ImportAnalyser.Dependency.name; _ } -> + let** _ = state in + + let select_query = + String.concat ~sep:"" + [ + "SELECT '"; + table_name; + "'.id, '"; + table_name; + "'.'key_"; + name; + "', '"; + name; + "' FROM '"; + table_name; + "' INNER JOIN (SELECT id, row_number() OVER(PARTITION BY '"; + table_name; + "'.'key_"; + name; + "' ORDER BY (id)) AS row_num from '"; + table_name; + "') other_table WHERE other_table.row_num <> 1 and '"; + table_name; + "'.id = other_table.id"; + ] + in + let stmt = Sqlite3.prepare db select_query in + + ignore + @@ Sqlite3.iter stmt ~f:(fun data -> + let values = + Array.mapi data ~f:(fun i value -> + (Sqlite3.column_name stmt i, T.to_datatype value)) + in + f values); + + let delete_query = + Printf.sprintf + {|UPDATE '%s' +SET key_%s = NULL +FROM +( + SELECT id, row_number() OVER(PARTITION BY key_%s ORDER BY (id)) AS row_num + from '%s' +) other_table +WHERE other_table.row_num <> 1 +and '%s'.id = other_table.id|} + table_name name name table_name table_name + in + + Sqlite3.exec db delete_query |> T.to_result) + +type 'a result = ('a, exn) Result.t diff --git a/lib/sql/db.mli b/lib/sql/db.mli new file mode 100644 index 0000000..465b159 --- /dev/null +++ b/lib/sql/db.mli @@ -0,0 +1,106 @@ +module Syntax = ImportConf.Syntax + +type 'a t +type 'a result = ('a, exn) Result.t + +val with_db : string -> ('a t -> unit result) -> unit result + +val check_table_schema : 'a t -> ImportAnalyser.Dependency.t -> bool result +(** Check if a table with the same structure already exists in the database. + + This query allow to reuse the same data without reloading the file if + nothing changed. *) + +val create_table : 'a t -> ImportAnalyser.Dependency.t -> unit result +(** [create_table db name] will create a new table in the + db with the given name, and the columns from the configuration (see + [ImportAnalyser.Query.create_table]) + + Any previous table with the same name will be deleted. *) + +val prepare_insert : 'a t -> ImportAnalyser.Dependency.t -> Sqlite3.stmt result +(** Create a statement to use in an insert. + [prepare_insert db table] will prepare a statement for inserting + the columns at the given index. *) + +val finalize : Sqlite3.stmt -> unit result +(** Finalize the statement. The function shall be called once each insert are + done, or after an error in the insert. *) + +val reset : Sqlite3.stmt -> unit result + +val eval_key : + 'a t -> + Sqlite3.stmt option -> + ImportAnalyser.Dependency.key list -> + (int * ImportCSV.DataType.t) list -> + (Sqlite3.stmt option * Sqlite3.Data.t list) result +(** Evaluate the keys in sqlite and get the results. + + The function is intended to check if the values are null before inserting + them in a batch *) + +val insert : + 'a t -> + Sqlite3.stmt -> + id:int -> + (int * ImportCSV.DataType.t) list -> + unit result +(** Insert a new row in the database. + + [insert db ~id statement values] will add a new row in the given table with + [id]. The columns are identified with their index number (there is a + difference by one with the column number) + + Thanks to SQLite Flexible Typing (https://www.sqlite.org/flextypegood.html) + each column can contain values typed differently which is how the spreadsheet + also works. + + This function is expected to be run inside a transaction. *) + +val begin_transaction : 'a t -> unit result +val commit : 'a t -> unit result +val rollback : 'a t -> unit result + +val query : + f: + ((ImportDataTypes.Path.t ImportExpression.T.t * ImportCSV.DataType.t) array -> + unit) -> + 'a t -> + Syntax.t -> + unit result +(** This one the most important function from the application. The function + will transform the configuration into an sql query and will fetch the + result from the sqlite engine. + + The function [f] given in argument will be called for each line + + *) + +val create_view : 'a t -> Syntax.t -> unit result +(** Create a view which represent the result *) + +val check_foreign : + f:((string * ImportCSV.DataType.t) array -> unit) -> + 'a t -> + Syntax.t -> + Syntax.extern -> + unit result + +val clear_duplicates : + f:((string * ImportCSV.DataType.t) array -> unit) -> + 'a t -> + ImportDataTypes.Table.t -> + ImportAnalyser.Dependency.key list -> + unit result +(** Remove all duplicated keys in the table by setting them to NULL. *) + +val insert_header : + 'a t -> + ImportDataTypes.Table.t -> + (int * ImportCSV.DataType.t) array -> + unit T.result + +val query_headers : + 'a t -> ImportDataTypes.Table.t -> ImportCSV.DataType.t array T.result +(** Get all the headers from the database (used or not) *) diff --git a/lib/sql/dune b/lib/sql/dune new file mode 100644 index 0000000..9f9f205 --- /dev/null +++ b/lib/sql/dune @@ -0,0 +1,15 @@ +(library + (name importSQL) + (libraries + re + sqlite3 + calendar + importAnalyser + importCSV + importConf + importContainers + importDataTypes + importErrors + importExpression + ) +) diff --git a/lib/sql/hashs.ml b/lib/sql/hashs.ml new file mode 100644 index 0000000..af1f092 --- /dev/null +++ b/lib/sql/hashs.ml @@ -0,0 +1,79 @@ +(** + This module store the hash of the indexes ([extern_key]) for each table in + order to update the file if the configuration changed. + + The hashes are stored in a table named [hashes] and are evaluated just + before inserting the values. +*) + +open StdLabels +module Table = ImportDataTypes.Table + +let ( let* ) = Result.bind + +let create_table : 'a T.t -> unit T.result = + fun db -> + Sqlite3.exec db + "CREATE TABLE IF NOT EXISTS 'hashes' ('table' TEXT, 'hash' INTEGER, \ + PRIMARY KEY ('table'))" + |> T.to_result + +let evaluate : ImportAnalyser.Dependency.t -> int = + fun table -> + (* Extract all the references to this table *) + let keys = + List.map (ImportAnalyser.Dependency.keys table) + ~f:(fun ImportAnalyser.Dependency.{ name; columns; expression } -> + ignore columns; + (name, expression)) + in + Hashtbl.hash keys + +let insert : 'a T.t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let table_name = Table.name source in + + let hash = evaluate table in + + let query = + String.concat ~sep:"" + [ + "INSERT INTO 'hashes' ('table', 'hash') VALUES ('"; + table_name; + "', :hash) ON CONFLICT(hashes.'table') DO UPDATE SET 'hash' = :hash"; + ] + in + let* statement = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + + let* _ = T.begin_transaction db in + let sql_data = Sqlite3.Data.INT (Int64.of_int hash) in + + let* _ = Sqlite3.bind_name statement ":hash" sql_data |> T.to_result in + let* _ = T.to_result (Sqlite3.step statement) in + T.commit db + +let query : 'a T.t -> ImportDataTypes.Table.t -> int option T.result = + fun db table -> + let table_name = Table.name table in + let query = + String.concat ~sep:"" + [ "SELECT hash FROM 'hashes' WHERE hashes.'table' = '"; table_name; "'" ] + in + + let* stmt = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + let state, res = + Sqlite3.fold stmt ~init:None ~f:(fun _ d -> + Some (T.to_datatype (Array.get d 0))) + in + + let* _ = T.to_result state in + match res with + | Some (ImportCSV.DataType.Integer i) -> Ok (Some i) + | _ -> Ok None diff --git a/lib/sql/header.ml b/lib/sql/header.ml new file mode 100644 index 0000000..3cac5fb --- /dev/null +++ b/lib/sql/header.ml @@ -0,0 +1,74 @@ +open StdLabels +module Table = ImportDataTypes.Table + +let ( let* ) = Result.bind + +let create_table : 'a T.t -> unit T.result = + fun db -> + Sqlite3.exec db + "CREATE TABLE IF NOT EXISTS 'header' ('table' TEXT, 'column' INTEGER, \ + 'label', PRIMARY KEY ('table', 'column'))" + |> T.to_result + +let insert_header : + 'a T.t -> + ImportDataTypes.Table.t -> + (int * ImportCSV.DataType.t) array -> + unit T.result = + fun db table values -> + let table_name = Table.name table in + + let query = + String.concat ~sep:"" + [ + "INSERT INTO 'header' ('table', 'column', 'label') VALUES ('"; + table_name; + "', :column, :label) ON CONFLICT(header.'table', header.'column') DO \ + UPDATE SET 'label' = :label"; + ] + in + + let statement = Sqlite3.prepare db query in + + let* _ = T.begin_transaction db in + let* _ = + Array.fold_left values ~init:(Ok ()) ~f:(fun acc (column, value) -> + let* _ = acc in + let sql_data = T.of_datatype value in + let* _ = Sqlite3.bind_name statement ":label" sql_data |> T.to_result in + + let* _ = + Sqlite3.bind_name statement ":column" + (Sqlite3.Data.INT (Int64.of_int column)) + |> T.to_result + in + let* _ = T.to_result (Sqlite3.step statement) in + T.reset statement) + in + T.commit db + +let query_headers : + 'a T.t -> ImportDataTypes.Table.t -> ImportCSV.DataType.t array T.result = + fun db table -> + let table_name = Table.name table in + let query = + String.concat ~sep:"" + [ + "SELECT label FROM 'header' WHERE header.'table' = '"; + table_name; + "' ORDER BY column DESC"; + ] + in + + let* stmt = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + let state, res = + Sqlite3.fold stmt ~init:[] ~f:(fun acc d -> + let value = T.to_datatype (Array.get d 0) in + value :: acc) + in + + let* _ = T.to_result state in + Ok (Array.of_list res) diff --git a/lib/sql/join.ml b/lib/sql/join.ml new file mode 100644 index 0000000..3f82b92 --- /dev/null +++ b/lib/sql/join.ml @@ -0,0 +1,30 @@ +module D = Sqlite3.Data + +let f : Sqlite3.Data.t array -> Sqlite3.Data.t = + fun arguments -> + if Array.length arguments < 2 then Sqlite3.Data.NULL + else + let sep = Array.get arguments 0 in + + (* Shift all the elements into an list*) + let contents = + Array.to_seqi arguments + |> Seq.filter_map (fun (i, value) -> + if i = 0 then None + else + match value with + | D.INT i -> Some (Int64.to_string i) + | D.FLOAT f -> Some (Float.to_string f) + | D.NONE -> None + | D.NULL -> None + | D.TEXT s | D.BLOB s -> + if String.length s = 0 then None else Some (String.trim s)) + |> List.of_seq + in + + D.TEXT (String.concat (D.to_string_coerce sep) contents) + +let register : Sqlite3.db -> unit = + fun db -> + Sqlite3.create_funN db "join" f; + Sqlite3.create_funN db "concat" f diff --git a/lib/sql/match.ml b/lib/sql/match.ml new file mode 100644 index 0000000..82fc1da --- /dev/null +++ b/lib/sql/match.ml @@ -0,0 +1,22 @@ +let f : Sqlite3.Data.t -> Sqlite3.Data.t -> Sqlite3.Data.t = + let memo = Hashtbl.create 16 in + fun str data_1 -> + match (str, data_1) with + | Sqlite3.Data.TEXT s, Sqlite3.Data.TEXT content -> ( + let regex = + match Hashtbl.find_opt memo s with + | None -> + let regex = Re.Posix.compile_pat s in + Hashtbl.add memo s regex; + regex + | Some v -> v + in + + match Re.exec_opt regex content with + | None -> Sqlite3.Data.NULL + | Some g -> + let matched = Re.Group.get g 1 in + Sqlite3.Data.TEXT matched) + | _, _ -> data_1 + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun2 db "match" f diff --git a/lib/sql/math.ml b/lib/sql/math.ml new file mode 100644 index 0000000..576d9f6 --- /dev/null +++ b/lib/sql/math.ml @@ -0,0 +1,20 @@ +(** Math functions *) + +let int : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + (* If the data is already an int, do not change it *) + | Sqlite3.Data.INT _ -> data + | Sqlite3.Data.FLOAT content -> Sqlite3.Data.INT (Int64.of_float content) + | Sqlite3.Data.BLOB content | Sqlite3.Data.TEXT content -> begin + match Int64.of_string_opt content with + | Some i -> Sqlite3.Data.INT i + | None -> begin + match Float.of_string_opt content with + | Some f -> Sqlite3.Data.INT (Int64.of_float f) + | None -> Sqlite3.Data.NULL + end + end + | _ -> Sqlite3.Data.NULL + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "int" int diff --git a/lib/sql/t.ml b/lib/sql/t.ml new file mode 100644 index 0000000..202c535 --- /dev/null +++ b/lib/sql/t.ml @@ -0,0 +1,52 @@ +exception SqlError of Sqlite3.Rc.t + +type 'a t = Sqlite3.db +type 'a result = ('a, exn) Result.t + +let to_result : Sqlite3.Rc.t -> unit result = function + | Sqlite3.Rc.OK | Sqlite3.Rc.DONE -> Ok () + | res -> Error (SqlError res) + +let begin_transaction : Sqlite3.db -> unit result = + fun db -> + let query = "BEGIN" in + Sqlite3.exec db query |> to_result + +let commit : Sqlite3.db -> unit result = + fun db -> + let query = "COMMIT" in + Sqlite3.exec db query |> to_result + +let rollback : Sqlite3.db -> unit result = + fun db -> + let query = "ROLLBACK" in + Sqlite3.exec db query |> to_result + +let savepoint : Sqlite3.db -> string -> unit result = + fun db name -> + let query = "SAVEPOINT " ^ name in + Sqlite3.exec db query |> to_result + +let release : Sqlite3.db -> string -> unit result = + fun db name -> + let query = "RELEASE SAVEPOINT " ^ name in + Sqlite3.exec db query |> to_result + +let finalize : Sqlite3.stmt -> unit result = + fun statement -> to_result (Sqlite3.finalize statement) + +let reset : Sqlite3.stmt -> unit result = + fun statement -> to_result (Sqlite3.reset statement) + +let of_datatype = function + | ImportCSV.DataType.Float f -> Sqlite3.Data.FLOAT f + | ImportCSV.DataType.Integer i -> Sqlite3.Data.INT (Int64.of_int i) + | ImportCSV.DataType.Null -> Sqlite3.Data.NULL + | ImportCSV.DataType.Error _ -> Sqlite3.Data.NULL + | ImportCSV.DataType.Content s -> Sqlite3.Data.TEXT s + +let to_datatype : Sqlite3.Data.t -> ImportCSV.DataType.t = function + | Sqlite3.Data.NONE | Sqlite3.Data.NULL -> ImportCSV.DataType.Null + | Sqlite3.Data.INT i -> ImportCSV.DataType.Integer (Int64.to_int i) + | Sqlite3.Data.FLOAT f -> ImportCSV.DataType.Float f + | Sqlite3.Data.TEXT t | Sqlite3.Data.BLOB t -> ImportCSV.DataType.Content t diff --git a/lib/sql/trim.ml b/lib/sql/trim.ml new file mode 100644 index 0000000..4e4bcf4 --- /dev/null +++ b/lib/sql/trim.ml @@ -0,0 +1,9 @@ +(* Override the trim function with another which also remove the retchar *) + +let f : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + | Sqlite3.Data.TEXT content -> Sqlite3.Data.TEXT (String.trim content) + | _ -> data + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "trim" f diff --git a/lib/sql/year.ml b/lib/sql/year.ml new file mode 100644 index 0000000..1e9c599 --- /dev/null +++ b/lib/sql/year.ml @@ -0,0 +1,19 @@ +(** Parse a text value into a date *) + +open CalendarLib + +let first_day = CalendarLib.Date.make 1899 12 30 + +let f : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + | Sqlite3.Data.INT content -> + let nb = Int64.to_int content in + let date = Date.add first_day (Date.Period.day nb) in + let year = CalendarLib.Date.year date in + + Sqlite3.Data.INT (Int64.of_int year) + | _ -> data + + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "year" f diff --git a/lib/tools/dune b/lib/tools/dune new file mode 100644 index 0000000..29ee744 --- /dev/null +++ b/lib/tools/dune @@ -0,0 +1,10 @@ +(library + (name tools)) + +(rule + (target git_hash.ml) + (deps git_head.sh (universe)) + (action + (with-stdout-to + %{target} + (bash "./git_head.sh")))) diff --git a/lib/tools/git_head.sh b/lib/tools/git_head.sh new file mode 100755 index 0000000..bfa738a --- /dev/null +++ b/lib/tools/git_head.sh @@ -0,0 +1,11 @@ +#! /bin/sh +# Include the git hash in an OCaml file. + +git diff-index --quiet HEAD -- +if [ $? -ne 0 ]; then + revision=": untracked" +else + revision=$(git rev-parse --short HEAD) +fi +compile_date=$(date +%Y/%m/%d) +echo "let revision = \"${revision} - compiled on ${compile_date}\"" diff --git a/readme.rst b/readme.rst new file mode 100644 index 0000000..ab7b5df --- /dev/null +++ b/readme.rst @@ -0,0 +1,668 @@ +.. -*- mode: rst -*- +.. -*- coding: utf-8 -*- + +.. role:: toml(code) + :language: toml + +.. default-role:: toml + +============================================ +Outil de construction des fichiers d’imports +============================================ + +.. raw:: html + + <style> + body { font-family: sans-serif; } + p, li { line-height: 1.5em; } + pre.code, code { + border: 1px solid #d2d3d7; + background-color: #f5f6f7; + } + pre.code { padding: 1em; margin: 2em 1.5em 2em 1.5em; } + code { display: inline-block; padding: 0.1em; } + table { border-collapse: collapse; } + thead { color: white; background-color: gray; } + td { border: none; } + table:not(.option-list) tr:not(:last-child):not(.field) { border-bottom: 1px solid lightgray; } + + .collapse_wrap > input { + display: none; + height: 0px; + } + + .collapse_wrap > label::before { + content: "\25B6"; + padding-right: 10px; + } + .collapse_wrap input:checked ~ label::before { + content: "\25BC"; + padding-right: 10px; + } + + .align-center { + display: block; + margin-left: auto; + margin-right: auto; + } + + .collapse_wrap .collapse { + overflow: hidden; + display: none; + transition: 0.5s; + box-shadow: 1px 2px 4px rgba(0, 0, 0, 0.3); + } + + .collapse_wrap > input:checked ~ .collapse { + display: block; + height: unset; + overflow: auto; + } + </style> + + + <script> + document.addEventListener("DOMContentLoaded", function() { + const collapsables = document.getElementsByClassName("collapse"); + for (let i = 0; i < collapsables.length; i++) { + var wrap = document.createElement("div"); + wrap.classList.add("collapse_wrap"); + + var radio = document.createElement("input"); + radio.setAttribute('type', 'checkbox'); + radio.setAttribute('id', 'radio_' + i); + + var label = document.createElement("label"); + label.setAttribute('for', 'radio_' + i); + label.innerHTML = "Afficher"; + + wrap.appendChild(radio); + wrap.appendChild(label); + + collapsables[i].parentNode.insertBefore(wrap, collapsables[i]); + wrap.appendChild(collapsables[i]); + } + }); + </script> + +.. contents:: + :depth: 2 + +L’application permet de construire un fichier d’import à partir d’une série de +règles indiquant comment assembler les données. L’outil est capable d’aller +chercher les données dans plusieurs fichiers différents et appliquer des règles +simples pour mettre en forme le résultat. + +Invocation +========== + +--conf + + Le chemin vers le fichier de configuration. Cet argument est obligatoire. + +--print-conf + + Écrit la configuration chargée sur la sortie courante et selon la dernière + syntaxe de l’application. + +.. --no-bom +.. +.. Ne pas insérer un BOM_ dans le fichier CSV. Cet indicateur permet +.. d’informer Excel que le fichier CSV est en UTF-8, ce qui devrait être le +.. cas. Dans le cas où cette chaine de caractère pose problème, il est +.. possible de désactiver ce mécanisme. +.. +.. .. _BOM: https://fr.wikipedia.org/wiki/Indicateur_d%27ordre_des_octets + +Sortie +====== + +L’exécution du programme va générer trois fichiers, qui sont tous basés sur le +nom du fichier de configuration : + + +sortie.csv + + Il s’agit du fichier produit par l’application, et dont les colonnes sont + celles définies dans le fichier de configuration. + +sortie.sqlite + + Il s’agit d’une base de données reprenant toutes les données utilisées pour + construire le résultat. + +sortie_error.csv + + Il s’agit d’un fichier CSV généré si l’application rencontre des erreurs. + Celles-ci concernent les liaisons entre les différents fichiers qui n’ont + pas pu être réalisées + +Fichier de configuration +======================== + +Les informations générales +-------------------------- + +version + Il s’agit de la version de la syntaxe du fichier de configuration. Valeur + attendue : `1` + +source + La clef `source` indique quel est le fichier source : pour chaque ligne + présente dans ce fichier, une ligne sera générée en sortie. + + :file: le fichier à charger + :tab: optionnellement l’onglet concerné + :name: le nom sous lequel le fichier sera associé. + + + +Le chargement des dépendances +----------------------------- + +La recherche de valeurs dans d’autres fichiers Excel nécessite de déclarer les +fichiers à charger. La liste `externals` décrit tous les fichiers qui doivent +être chargés, et comment les données doivent être associée avec le fichier +source. + +.. code:: toml + + [externals.target] + intern_key = ":source.A" + extern_key = ":A" + file = "financial.xlsx" + allow_missing = false + +Les clefs suivantes sont nécessaires pour lier les données venant d’un autre +fichier : + +intern_key + Il s’agit de la colonne servant à faire la liaison dans la source. +file + Le fichier à charger +tab + optionnellement l’onglet concerné +extern_key + Il s’agit de la colonne servant à faire la liaison dans le fichier devant + être lié. La clef doit doit etre unique, l’application supprimera les + doublons pour ne garder qu’une seule valeur par clef. Il n’est donc pas + possible de faire un produit cartésien. +allow_missing + Cette clef optionnelle indique s’il faut autoriser les valeurs manquantes + lors dans une liaison. Déclarer `allow_missing` va également autoriser les + doublons sur les valeurs de clef. + +Une fois la dépendance vers un fichier externe déclaré, il est possible +d’utiliser les colonnes de ce fichier en les référençant directement. Cela +revient à faire un `RECHERCHEV` dans Excel à chaque fois. + +Il est possible d’utiliser une référence externe dans une autre source externe, +afin de construire des chemins sur plusieurs niveaux : + +.. code:: toml + + [externals.acheteur_annuaire] + intern_key = ":I" + extern_key = ":A" + file = "ANNUAIRE.xlsx" + + [externals.acheteur_societe] + intern_key = ":acheteur_annuaire.BJ" + extern_key = ":A" + file = "SOCIETES.xlsx" + + +Les valeurs présentes dans ces colonnes sont pré-traitées pour éviter les +erreurs générales lors des imports : les espaces en fin de texte sont +supprimés et le texte est passé en capitale. + +Définition des colonnes +----------------------- + +Une fois les différents fichiers chargés, il est possible de construire le +schéma du fichier de sortie en listant toutes les colonnes à générer : + +.. code:: toml + + """function + ( :A ^ '_' ^ :target.E + , :E + )""" + +======================= ======================================= ====================== +Type d’élément Interprétation Exemple +======================= ======================================= ====================== +Texte Un texte libre. Celui-ci doit être `'_'` + placé entre quote (`'` ou `"`). + +Un nombre La valeur correspondante `1234` + +Une référence Une colonne. Celle-ci peut être définie `:A` + via un fichier externe ou directement + par rapport au fichier source. `:target.E` + + Elle ne doit pas être placée entre + quote. Le nom du fichier peut être omis + (et est alors considéré comme étant le + fichier source) + +Opérateur `^` Concaténation d’éléments `"contract_" ^ :A` + +Opérateur mathématique `+` `-` `/` `=` `<>` `:Q + 1` + `:A = 2000` + +Fonction Applique la fonction sur les `nvl(:I, :N, "Defaut")` + éléments donnés. La fonction ne doit + pas être placée entre quote `trim(:N)` + + La fonction prend en paramètre + les arguments sous forme de `trim("prefixe_" ^ :C)` + liste, séparés par des virgules. + +======================= ======================================= ====================== + + +.. class:: collapse + + :Operator: `+` | `-` | `<` | `>` | `/` | `^` + :Equality: `=` | `<>` + + :Reference: + | `:`\ COLUMN + | `:`\ TABLE\ `.`\ COLUMN + + :Expression: + | LITERAL + | NUMERIC + | *Reference* + | *Expression* *Operator* *Expression* + | *Expression* *Equality* *Expression* + | *Expression* *Equality* *Group* + | IDENTIFIER `(` *Expression*\ + `)` + | IDENTIFIER `(` *Expression*\ , *Group*, *Group* `)` + | IDENTIFIER `(` *Group*, *Group* `)` + :Group: + | `[` Expression* `]` + +Tri des données +--------------- + +.. code:: toml + + sort = [ + ":mouvements.B", + ":lib_titres.B", + "date('%d/%m/%Y', :mouvements.C)", + ":A", + ] + +Les données peuvent être triées à l’aide de la section `sort`. Il est possible +de définir plusieurs critères de tri qui seront traités par ordre de priorité. +Des fonctions peuvent également être utilisées (dans l’exemple ci-dessus, nous +avons un tri sur des dates au format dd/mm/yyyy, si la donnée n’était pas +convertie en date, elle serait triée sur le jour avant d’être triée sur +l’année). + +Filtrer les données +------------------- + +.. code:: toml + + filters = [ + ":t_operation.B <> ['Cession', 'Restitution', 'Prêt de titres']", + ":lib_titres.B <> 0", + ":societe.A <> 0", + ] + +Un filtre peut être mis en place pour ne conserver que les lignes souhaitées. +Les conditions doivent être toutes validées pour que la ligne soit retenue. + +.. note:: + + Il est possible de mettre une fonction de groupe dans les filtres, pour + n’extraire par exemple que la première ligne d’une série. Toutefois, cette + fonctionnalité est encore expérimentale. L’application peut lever une + erreur si la fonction est trop complexe pour etre interprétée. + + Si une fonction de groupe est présente, l’application ordonnera les + évaluations selon l’ordre suivant : + + 1. En premier lieu tous les filtres *simples* pour ne conserver que les + lignes correspondantes. + 2. Ensuite seulement la fonction de groupe présente dans la section `filter` + +Exemple complet +=============== + +Cet exemple (issu d’un cas réel), crée un fichier des personnes morales à +partir de plusieurs tables. + +- Le fichier source donne des identifiants pour les formes juridiques (qu’il + faut donc aller dans une table à part) et les devises + +- le numéro de TVA communautaire doit être chargé dans la table + immatriculation, la colonne A doit correspondre au code de la société, et la + colonne D doit avoir la valeur `4` + +- l’adresse est reconstituée en concaténant cinq colonnes depuis la table + annuaire. + +- la ville et le code postal peuvent être présent sur deux tables : la table + annuaire si l’adresse n’a pas été modifiée depuis la saisie initiale de la + fiche, ou la source qui contient la dernière valeur, mais peut être nulle si + celle-ci n’a pas été modifiée… + +.. code:: toml + + version = 1 + + [source] + file = "20220222_SOCIETES.xlsx" + name = "société" + + # Des commentaires peuvent etre ajoutés tout au long du fichier. + # Ici la table actif permet de faire une correspondance entre les valeurs + # du client et nos valeurs : + # 1 -> Actif + # 0 -> Inactif + [externals.actifs] + intern_key = ":E" + file = "actifs.xlsx" + extern_key = ":A" + allow_missing = false + + [externals.legalform] + intern_key = ":J" + file = "mapping.xlsx" + tab = 2 + extern_key = ":A" + allow_missing = false + + [externals.currency] + intern_key = ":BF" + file = "mapping.xlsx" + tab = 6 + extern_key = ":A" + allow_missing = false + + # le fichier annuaire contient des informations supplémentaires + [externals.annuaire] + intern_key = ":A" + file = "20220222_ANNUAIRE.xlsx" + extern_key = ":BJ" + allow_missing = false + + # La table des immatriculations. + # Seules les lignes avec la colonne D = 4 nous intéressent + [externals.immat] + intern_key = ":A ^ '_4'" + file = "20220222_SO_IMMATRICULATION.xlsx" + extern_key = ":B ^ '_' ^ :D" + allow_missing = true + + [sheet] + columns = [ + "'companyDirectory_' ^ :A", + ":G", + "'internalExternalList_Internal'", + ":actifs.B", + ":H", + ":I", + "", + ":legalform.S", + ":annuaire.I", + ":T", + ":BP", + ":currency.D", + "", + ":annuaire.CC", + ":CQ", + ":CO", + ":immat.C", + # Utilise la fonction join pour assembler les éléments + # De cette manière, si l’un des éléments est vide, il n’y aura pas de + # double espace (' ') dans le résultat + "join(' ', :annuaire.CP, :annuaire.CQ, :annuaire.CR, :annuaire.L, :annuaire.M)", + "nvl(:CM, :annuaire.N)", + "nvl(:CS, :annuaire.AB)", ":CR" + ] + filters = [] + sort = [] + uniq = [] + +Comparaison des valeurs +======================= + +Comparaison par groupe +---------------------- + +.. code:: toml + + ":t_operation.B <> ['Cession', 'Restitution', 'Prêt de titres']" + +Les opérateurs de comparaison `=` et `<>` peuvent accepter un groupe de valeur +plutot qu’une valeur unique. Dans ce cas, la condition est vraie si l’une des +valeurs est présente. + +Le cas des cellules vides +------------------------- + +.. code:: toml + + ":B > 0", + "'prefix_' ^ :B", + +Les cellules vides peuvent apparaitres dans des colonnes où l’on s’attend à +trouver des champs texte, ou des nombres. L’application va traiter ces valeurs +vides en fonction du contexte et les considérer comme `0` si elles sont +associées avec un numérique, ou `""` si elles sont associées à un texte. + + +Fonctions disponibles +===================== + +Liste non exhaustive. + +`if` + Choisi la valeur de la cellule en fonction d’une condition : + + .. code:: toml + + """if( + :E = 1 + , 'Vrai' + , 'Faux' + )""" + +`nvl` + Sélectionne la première valeur non nulle de la liste + + .. code:: toml + + "nvl(:I, :N, 'Defaut')" + +Fonctions sur les nombres +------------------------- + +`abs` + Renvoie la valeur absolue d’un nombre. + +`int` + Transforme une valeur en nombre entier. + +Fonctions sur le texte +---------------------- + +`join` + .. code:: toml + + "join('-', :A, :target.E, :B)" + + Concatène tous les champs en utilisant le premier paramètre comme + séparateur. Les valeurs vides sont ignorées, et dans ce cas le séparateur + n’est pas répété. + +`match` + Extrait un motif sur la base d’une `expression régulière`_. + + .. code:: toml + + "match('hello ([A-Za-z]+)', ':B')" + +.. _expression régulière: + https://fr.wikipedia.org/wiki/Expression_r%C3%A9guli%C3%A8re#Op%C3%A9rateurs + +`substring` + Extrait une sous-chaine en indiquant la position et la longueur. + + .. code:: toml + + "substring(:H, 1, 5)" + +`trim` + Supprime les espaces en début et fin de chaine + + .. code:: toml + + "trim(:C)" + +`upper` + Passe le texte en capitale + + .. code:: toml + + "upper(:A)" + +Fonctions sur les dates +----------------------- + +Normalement, les dates sont représentées dans Excel comme un nombre. On peut +donc utiliser les opérations standard (addition…) sans avoir à se poser de +question sur leur résultat. Toutefois, il est parfois nécessaire de faire des +opérations avancées. + + +`date` + Converti une date depuis le format texte. Le résultat de la fonction est le + nombre de jours écoulés depuis le 31 décembre 1899 (`1` correspond au 1er + janvier 1900). Cela correspond à la manière dont Excel enregistre une + valeur de type date. + + .. code:: toml + + "date('%d/%m/%Y', :B)" + + +`year` + Extrait l’année (sur 4 chiffres) à partir d’une date. Si la date est donnée + directement, il vaut mieux utiliser `substring` qui est plus simple + d’accès. `year` devient utile s’il y a des calculs associés : + + .. code:: toml + + "year( date('%d/%m/%Y', :M) + date('%d/%m/%Y', '31/12/1999') )", + + +Les fonctions de groupes +------------------------ + +Les fonctions suivantes permettent de grouper les données ou de les mettre en +correspondance avec d’autres lignes du fichier. + +Elles prennent se basent sur deux paramètres supplémentaires : + +1. le premier étant les colonnes à identifier pour faire le regroupement, +2. le second étant l’ordre dans lequel les lignes doivent être triées. + + `counter([regroupement, …],[tri, …])` + + Le tri est ignoré si le regroupement est omi. + + +`counter` + Crée un compteur qui s’incrémente tant que les lignes sont identiques. + + L’exemple suivant va remettre le compteur à `1` dès que la société ou le + titre change, et utilisera la date pour ordonner les valeurs dans ce + groupe. + + .. code:: toml + + """counter( + [:societe.H, :lib_titres.B], + [date('%d/%m/%Y', :mouvements.C), :A] + )""" + + + La fonction peut également être utilisée pour ne conserver que les lignes + uniques au sein d’une plage donnée (via un post-traitement où l’on ne garde + que les valeurs à `1`) + + .. code:: toml + + "counter([:C], [:A]) = 1" + + Il est possible de construire plusieurs compteurs sur des groupes et des + tris différents au sein d’un même fichier. + + Si le critère de tri n’est pas donné, l’application va compter le nombre + d’éléments pour chaque groupe: toutes les cellules auront le même résultat. + +`previous` + La fonction `previous` permet d’aller chercher l’élément précédent dans les + données. + + En fonction du critère de regroupement, la valeur renverra vide (si l’on + est sur groupe différent), et la valeur précédente sera recherchée en + fonction du critère de tri donné. + + `previous(expression, [regroupement, …], [tri, …])` + + .. code:: toml + + """previous( + :ths.G, + [:socs.EB, :socs.L, :ths.E], + [:ths.H])""" + +`sum` + La fonction `sum` permet de calculer la somme d’une colonne donnée sur un + critère de regroupement. + + Si le critère de tri n’est pas donné, l’application va calculer la somme + totale en fonction du critère de regroupement : toutes les cellules auront + le même résultat. + + `sum(expression, [regroupement, …], [])` + + .. code:: toml + + """sum( + :shares.K, + [:shares.A, :shares:D], + [])""" + + Au contraire, si un tri est donné, l’application va accumuler la somme tout + au long des valeurs rencontrées, et selon l’ordre du tri. + +`min` + La fonction `min` permet d’aller chercher le premier élément du groupe + (c’est à dire le plus petit selon l’ordre de tri) + + + `min(expression, [regroupement, …], [tri, …])` + + .. code:: toml + + """min( + :mouvements.C, + [:societe.A, :I], + [:mouvements.C])""" + + exemple d’utilisation : rechercher la première date d’achat. + +`max` + La fonction `max` permet d’aller chercher le dernier élément du groupe. + + `max(expression, [regroupement, …], [tri, …])` + diff --git a/tests/analyser_dependency.ml b/tests/analyser_dependency.ml new file mode 100644 index 0000000..dea7727 --- /dev/null +++ b/tests/analyser_dependency.ml @@ -0,0 +1,247 @@ +open OUnit2 +open StdLabels +module A = ImportAnalyser.Dependency +module Cont = ImportContainers +module Syntax = ImportConf.Syntax +module Expression = ImportExpression.T +module Table = ImportDataTypes.Table +open ConfLoader + +let test_order = + "Order" >:: fun _ -> + let order = A.get_process_order conf |> List.map ~f:A.table in + let expected_order = + [ external_table_last; external_table_other; external_table_source ] + in + assert_equal ~cmp:(cmp_list cmp_source) ~printer:show_sources expected_order + order + +let test_columns = + "Columns" >:: fun _ -> + let expected_colums = Cont.IntSet.of_list [ 1; 2 ] in + let columns = + A.get_process_order conf + |> List.find ~f:(fun v -> A.table v = external_table_source) + |> A.columns + in + + assert_equal + ~cmp:(fun a b -> 0 = Cont.IntSet.compare a b) + ~printer:Cont.show_intSet expected_colums columns + +let test_keys = + "Keys" >:: fun _ -> + (* We should have one key in the table other, because it is referenced as a + source in last file *) + let expected_keys = + A. + [ + { + name = "other"; + expression = Expression.Path 3; + columns = lazy (Cont.IntSet.singleton 3); + }; + ] + in + + let keys = + A.get_process_order conf + |> List.find ~f:(fun v -> A.table v = external_table_other) + |> A.keys + in + + assert_equal ~cmp:(cmp_list key_cmp) ~printer:keys_printer expected_keys keys + +let test_keys_missing = + "Keys missing" >:: fun _ -> + (* We have no key in last_file because the key is declared as missing *) + let expected_keys = + A. + [ + { + name = "last_file"; + expression = Expression.Path 3; + columns = lazy (Cont.IntSet.singleton 3); + }; + ] + in + + let keys = + A.get_process_order conf + |> List.find ~f:(fun v -> A.table v = external_table_last) + |> A.keys + in + + assert_equal ~cmp:(cmp_list key_cmp) ~printer:keys_printer expected_keys keys + +let test_unknow_source = + "Unknown source" >:: fun _ -> + let conf = { conf with externals = [] } in + assert_raises (ImportErrors.Unknown_source "last_file") (fun () -> + A.get_process_order conf) + +let test_unordered = + "Unorderd references" >:: fun _ -> + (* Externals not described in the right order shall not raise any + error. *) + let conf = + load + {|version = 1 +[source] + file = "source.xlsx" + name = "source" + +[externals.last_file] + intern_key = ":other.A" + file = "last.xlsx" + extern_key = ":C" + allow_missing = true + +[externals.other] + intern_key = ":A" + file = "other.xlsx" + extern_key = ":C" + allow_missing = false + +[sheet] + columns = []|} + in + assert_raises (ImportErrors.Unknown_source "other") (fun () -> + A.get_process_order conf) + +let test_circular = + "Unlinked reference" >:: fun _ -> + (* A reference to itself should be understood *) + let conf = + load + {|version = 1 +[source] + file = "source.xlsx" + name = "source" + +[externals.circular] + intern_key = ":circular.A" + file = "last.xlsx" + extern_key = ":A" + allow_missing = true + +[sheet] + columns = []|} + in + + let elements = A.get_process_order conf in + assert_equal ~printer:string_of_int 1 (List.length elements) + +let test_unlinked = + "Circular reference" >:: fun _ -> + (* An element linked to anything (except itself) should be ignored *) + let conf = + Syntax. + { + version = 1; + source = external_table_source; + externals = + [ + { + intern_key = Path { alias = Some "circular2"; column = 1 }; + target = { file = "other.xlsx"; tab = 1; name = "circular" }; + extern_key = Path 3; + allow_missing = true; + match_rule = None; + }; + { + intern_key = Path { alias = Some "circular"; column = 1 }; + target = { file = "other2.xlsx"; tab = 1; name = "circular2" }; + extern_key = Path 3; + allow_missing = true; + match_rule = None; + }; + ]; + columns = []; + filters = []; + sort = []; + uniq = []; + } + in + assert_raises (ImportErrors.Unknown_source "circular2") (fun () -> + A.get_process_order conf |> List.map ~f:A.table) + +let conf_with_unlinked = + Syntax. + { + version = 1; + source = external_table_source; + externals = + [ + { + intern_key = Path { alias = None; column = 1 }; + target = { file = "other.xlsx"; tab = 1; name = "other" }; + extern_key = Path 3; + allow_missing = false; + match_rule = None; + }; + ]; + columns = + [ + Concat [ Path { alias = None; column = 1 }; Literal "_"; Empty ]; + Path { alias = None; column = 2 }; + ]; + filters = []; + sort = []; + uniq = []; + } + +(** A table referenced only in a filter list shall be loaded correctly *) +let test_order_filter = + "Order filter" >:: fun _ -> + let order = + { + conf_with_unlinked with + filters = [ Path { alias = Some "other"; column = 5 } ]; + } + |> A.get_process_order |> List.map ~f:A.table + in + let expected_order = [ external_table_other; external_table_source ] in + assert_equal ~printer:show_sources expected_order order + +(** A table referenced only in the order list shall be loaded correctly *) +let test_order_sort = + "Order sort" >:: fun _ -> + let order = + { + conf_with_unlinked with + sort = [ Path { alias = Some "other"; column = 5 } ]; + } + |> A.get_process_order |> List.map ~f:A.table + in + let expected_order = [ external_table_other; external_table_source ] in + assert_equal ~printer:show_sources expected_order order + +(** A table referenced only in the uniq list shall be loaded correctly *) +let test_order_uniq = + "Order uniq" >:: fun _ -> + let order = + { + conf_with_unlinked with + uniq = [ Path { alias = Some "other"; column = 5 } ]; + } + |> A.get_process_order |> List.map ~f:A.table + in + let expected_order = [ external_table_other; external_table_source ] in + assert_equal ~printer:show_sources expected_order order + +let tests = + "analyser_dependency" + >::: [ + test_order; + test_columns; + test_keys; + test_keys_missing; + test_unknow_source; + test_unordered; + test_circular; + test_unlinked; + test_order_filter; + test_order_sort; + test_order_uniq; + ] diff --git a/tests/analyser_query_test.ml b/tests/analyser_query_test.ml new file mode 100644 index 0000000..3559de4 --- /dev/null +++ b/tests/analyser_query_test.ml @@ -0,0 +1,304 @@ +open OUnit2 +open StdLabels +module A = ImportAnalyser.Dependency +module Q = ImportAnalyser.Query +module C = ImportConf +module Syntax = ImportConf.Syntax +module Expr = Expression_builder + +let show_source (source : ImportDataTypes.Table.t) = + Printf.sprintf "%s:%d" source.ImportDataTypes.Table.file source.tab + +let show_sources sources = + let b = Buffer.create 16 in + Buffer.add_string b "["; + List.iter sources ~f:(fun source -> + Buffer.add_string b (show_source source); + Buffer.add_string b ","); + + let len = Buffer.length b in + if len > 1 then Buffer.truncate b (len - 1); + Buffer.add_string b "]"; + + Buffer.contents b + +(** This is sample configuration used in the tests *) +let conf = + Syntax. + { + version = 1; + source = { file = "source.xlsx"; tab = 1; name = "source" }; + externals = + [ + { + intern_key = Path { alias = None; column = 1 }; + target = { file = "other.xlsx"; tab = 1; name = "other" }; + extern_key = Path 3; + allow_missing = false; + match_rule = None; + }; + { + intern_key = Path { alias = Some "other"; column = 1 }; + target = { file = "last.xlsx"; tab = 1; name = "last_file" }; + extern_key = Path 3; + allow_missing = true; + match_rule = None; + }; + ]; + columns = + [ + Concat [ Path { alias = None; column = 1 }; Literal "_"; Empty ]; + Path { alias = None; column = 2 }; + Path { alias = Some "last_file"; column = 5 }; + ]; + filters = []; + sort = []; + uniq = []; + } + +let create_table = + "Create table" >:: fun _ -> + let out = A.get_process_order conf in + + let query = Q.create_table (List.hd out) in + + assert_equal ~printer:Fun.id + "CREATE TABLE 'last' (id INTEGER PRIMARY KEY,'key_last_file','col_5')" query + +let select = + "Select" >:: fun _ -> + let query, _ = Q.select conf in + let expected_query = + {|SELECT COALESCE('source'.col_1,'') || ? || '' AS result_0, +'source'.col_2 AS result_1, +'last_file'.col_5 AS result_2 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.col_1)) = 'other'.'key_other' +LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.col_1)) = 'last_file'.'key_last_file'|} + in + + assert_equal ~printer:Fun.id expected_query query.q + +let check_externals = + "Check external" >:: fun _ -> + let query = Q.check_external conf (List.hd conf.externals) in + + let expected_query = + "SELECT 'source'.'id', 'source'.col_1 FROM\n\ + 'source' AS 'source' LEFT JOIN 'other' AS 'other' ON \ + rtrim(upper('source'.col_1)) = 'other'.'key_other' WHERE \ + 'other'.'key_other' IS NULL AND 'source'.col_1 IS NOT NULL AND \ + 'source'.col_1 <> ''" + in + + assert_equal ~printer:Fun.id expected_query query.q + +let previous = + "Test window previous" >:: fun _ -> + (* This is sample configuration used in the tests *) + let conf = + Syntax. + { + version = 1; + source = { file = "source.xlsx"; tab = 1; name = "previous" }; + externals = []; + columns = + [ + Window + ( Previous (Path { alias = None; column = 5 }), + [ Path { alias = None; column = 1 } ], + [ Path { alias = None; column = 3 } ] ); + ]; + filters = []; + sort = []; + uniq = []; + } + in + + let res, _ = ImportAnalyser.Query.select conf in + let query = + "SELECT LAG('previous'.col_5) OVER (PARTITION BY 'previous'.col_1 ORDER BY \ + 'previous'.col_3) AS result_0\n\ + FROM 'source' AS 'previous'" + in + assert_equal ~printer:Fun.id query res.q + +let sum = + "Test window sum" >:: fun _ -> + (* This is sample configuration used in the tests *) + let conf = + Syntax. + { + version = 1; + source = { file = "source.xlsx"; tab = 1; name = "previous" }; + externals = []; + columns = + [ + Window + ( Sum (Path { alias = None; column = 5 }), + [ Path { alias = None; column = 1 } ], + [] ); + ]; + filters = []; + sort = []; + uniq = []; + } + in + + let res, _ = ImportAnalyser.Query.select conf in + let query = + "SELECT SUM('previous'.col_5) OVER (PARTITION BY 'previous'.col_1) AS \ + result_0\n\ + FROM 'source' AS 'previous'" + in + assert_equal ~printer:Fun.id query res.q + +let sum_total = + "Test sum over the whole range" >:: fun _ -> + (* This is sample configuration used in the tests *) + let conf = + Syntax. + { + version = 1; + source = { file = "source.xlsx"; tab = 1; name = "previous" }; + externals = []; + columns = [ Window (Sum (Path { alias = None; column = 5 }), [], []) ]; + filters = []; + sort = []; + uniq = []; + } + in + + let res, _ = ImportAnalyser.Query.select conf in + let query = + "SELECT SUM('previous'.col_5) AS result_0\nFROM 'source' AS 'previous'" + in + assert_equal ~printer:Fun.id query res.q + +let sum_unfiltered = + "Test sum over the whole range" >:: fun _ -> + (* This is sample configuration used in the tests *) + let conf = + Syntax. + { + version = 1; + source = { file = "source.xlsx"; tab = 1; name = "previous" }; + externals = []; + columns = + [ + Window + ( Sum (Path { alias = None; column = 5 }), + [], + [ Path { alias = None; column = 1 } ] ); + ]; + filters = []; + sort = []; + uniq = []; + } + in + + let res, _ = ImportAnalyser.Query.select conf in + let query = + "SELECT SUM('previous'.col_5) AS result_0\nFROM 'source' AS 'previous'" + in + assert_equal ~printer:Fun.id query res.q + +let prepare_insert = + "Test prepare_insert" >:: fun _ -> + let key = + ImportAnalyser.Dependency. + { + name = "key_test"; + expression = Concat [ Path 1; Literal "_"; Empty ]; + columns = lazy (ImportContainers.IntSet.singleton 1); + } + in + + let buffer = Buffer.create 16 in + let () = ImportAnalyser.Query.build_key_insert buffer key in + let contents = Buffer.contents buffer in + + let expected = "rtrim(upper(COALESCE(:col_1,'') || '_' || ''))" in + + assert_equal ~printer:Fun.id expected contents + +(** Test a request with a group in a filter. + +This generate a CTE expression in order to evaluate the group before loading +the results from the query. *) +let filter_group = + "Test filter_group" >:: fun _ -> + let c col = Expr.path ImportDataTypes.Path.{ alias = None; column = col } in + let conf = + { + conf with + columns = [ c 1 ]; + filters = [ Expr.(max (c 3) [ c 1 ] [ c 1 ]) ]; + } + in + let contents, _ = ImportAnalyser.Query.select conf in + + let expected = + {|WITH cte AS (SELECT source.id, LAST_VALUE('source'.col_3) OVER (PARTITION BY 'source'.col_1 ORDER BY 'source'.col_1 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS group0 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.col_1)) = 'other'.'key_other' +LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.col_1)) = 'last_file'.'key_last_file') +SELECT 'source'.col_1 AS result_0 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.col_1)) = 'other'.'key_other' +LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.col_1)) = 'last_file'.'key_last_file' +INNER JOIN 'cte' ON cte.id = source.id +WHERE (cte.group0)|} + in + + assert_equal ~printer:(fun s -> Printf.sprintf "\n%s" s) expected contents.q + +(** Test a request with a group in a filter. + +This generate a CTE expression in order to evaluate the group before loading +the results from the query. *) +let filter_group2 = + "Test filter_group" >:: fun _ -> + let c col = Expr.path ImportDataTypes.Path.{ alias = None; column = col } in + let conf = + { + conf with + columns = [ c 1 ]; + filters = + [ Expr.(max (c 3) [ c 1 ] [ c 1 ]); Expr.equal (c 3) Expr.integer_zero ]; + } + in + let contents, _ = ImportAnalyser.Query.select conf in + + let expected = + {|WITH cte AS (SELECT source.id, LAST_VALUE('source'.col_3) OVER (PARTITION BY 'source'.col_1 ORDER BY 'source'.col_1 RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS group0 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.col_1)) = 'other'.'key_other' +LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.col_1)) = 'last_file'.'key_last_file' +WHERE COALESCE('source'.col_3,0)=0) +SELECT 'source'.col_1 AS result_0 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.col_1)) = 'other'.'key_other' +LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.col_1)) = 'last_file'.'key_last_file' +INNER JOIN 'cte' ON cte.id = source.id +WHERE COALESCE('source'.col_3,0)=0 +AND (cte.group0)|} + in + + assert_equal ~printer:(fun s -> Printf.sprintf "\n%s" s) expected contents.q + +let test_suit = + [ + create_table; + select; + check_externals; + previous; + sum; + sum_total; + sum_unfiltered; + prepare_insert; + filter_group; + filter_group2; + ] + +let tests = "analyser_query_test" >::: test_suit diff --git a/tests/confLoader.ml b/tests/confLoader.ml new file mode 100644 index 0000000..266ff33 --- /dev/null +++ b/tests/confLoader.ml @@ -0,0 +1,128 @@ +open StdLabels + +(** Read the configuration in toml and return the internal representation *) +let load : string -> ImportConf.Syntax.t = + fun content -> + Otoml.Parser.from_string content |> ImportConf.t_of_toml |> Result.get_ok + +let conf = + load + {|version = 1 + +[source] + file = "source.xlsx" + name = "source" + +[externals.other] + intern_key = ":A" + file = "other.xlsx" + extern_key = ":C" + allow_missing = false + +[externals.last_file] + intern_key = ":other.A" + file = "last.xlsx" + extern_key = ":C" + allow_missing = true + +[sheet] + columns = [ + ":A ^ '_'", + ":B", + ":last_file.E", + ]|} + +let external_table_source = + ImportDataTypes.Table.{ file = "source.xlsx"; tab = 1; name = "source" } + +let external_table_other = + ImportDataTypes.Table.{ file = "other.xlsx"; tab = 1; name = "other" } + +let external_other = + ImportConf.Syntax. + { + intern_key = Path { alias = None; column = 1 }; + target = external_table_other; + extern_key = Path 3; + allow_missing = false; + match_rule = None; + } + +let external_table_last = + ImportDataTypes.Table.{ file = "last.xlsx"; tab = 1; name = "last_file" } + +let external_last = + ImportConf.Syntax. + { + intern_key = Path { alias = Some "other"; column = 1 }; + target = external_table_last; + extern_key = Path 3; + allow_missing = true; + match_rule = None; + } + +let show_source (source : ImportDataTypes.Table.t) = + Printf.sprintf "%s:%d" source.ImportDataTypes.Table.file + source.ImportDataTypes.Table.tab + +(* + * Compare two external sources + *) + +let show_sources sources = + let b = Buffer.create 16 in + Buffer.add_string b "["; + List.iter sources ~f:(fun source -> + Buffer.add_string b (show_source source); + Buffer.add_string b ","); + + let len = Buffer.length b in + if len > 1 then Buffer.truncate b (len - 1); + Buffer.add_string b "]"; + + Buffer.contents b + +and cmp_source : ImportDataTypes.Table.t -> ImportDataTypes.Table.t -> bool = + fun s1 s2 -> + String.equal s1.ImportDataTypes.Table.name s2.ImportDataTypes.Table.name + && String.equal s1.ImportDataTypes.Table.file s2.ImportDataTypes.Table.file + && s1.ImportDataTypes.Table.tab = s2.ImportDataTypes.Table.tab + +let cmp_list : ('a -> 'a -> bool) -> 'a list -> 'a list -> bool = + fun cmp elems1 elems2 -> List.for_all2 ~f:cmp elems1 elems2 + +(* + * Compare keys in the dependencies + *) + +let key_printer : ImportAnalyser.Dependency.key -> string = + fun { name; expression; _ } -> + let path_name = + let buffer = Buffer.create 16 in + ImportExpression.Headers.headers_of_expression buffer + (fun col buffer -> + Buffer.add_string buffer (ImportCSV.Csv.column_to_string col)) + expression; + Buffer.contents buffer + in + Printf.sprintf "%s, %s" name path_name + +and key_cmp a b = + 0 + = ImportExpression.T.cmp + (fun a b -> a - b) + a.ImportAnalyser.Dependency.expression + b.ImportAnalyser.Dependency.expression + +let keys_printer : ImportAnalyser.Dependency.key list -> string = + fun contents -> + let b = Buffer.create 16 in + List.iter contents ~f:(fun v -> Buffer.add_string b (key_printer v)); + Buffer.contents b + +(* + * Represents externals + *) + +let pp_externals : ImportConf.Syntax.extern list -> string = + fun ext -> ImportConf.Syntax.toml_of_externs ext |> Otoml.Printer.to_string diff --git a/tests/configuration/example_csv.toml b/tests/configuration/example_csv.toml new file mode 100644 index 0000000..24cee9b --- /dev/null +++ b/tests/configuration/example_csv.toml @@ -0,0 +1,31 @@ +version = 1 + +[source] + file = "importer.csv" + name = "source" + +[externals.target] + intern_key = ":source.A" + extern_key = ":A" + file = "financial.xlsx" + allow_missing = false + +[externals.a_financial] + intern_key = ":target.A" + extern_key = ":O" + file = "financial.xlsx" + allow_missing = false + +[sheet] + columns = [":A", + "concat(\"-\", :A, :target.E, :B)", + ":C", + "counter([:C],[:A])", + ":E", + "match(\"\\(..\\)\", :B)", + ":D", + "counter([:D],[:A])" + ] + filters = [] + sort = [] + uniq = [] diff --git a/tests/configuration/simple.toml b/tests/configuration/simple.toml new file mode 100644 index 0000000..d41383a --- /dev/null +++ b/tests/configuration/simple.toml @@ -0,0 +1,20 @@ +version = 1 + +[source] +name = "source_name" +file = "source_file" +tab = 1 + +[externals.target] +extern_key = "\"_B\"" +# Here, the values A & B are considered as column, and not litteral +intern_key = "function(:A, :B)" +allow_missing = true +file = "" +tab = 1 + +[sheet] +columns = [ + "function(:target.A, :B, 'free\\' text')", + "counter([:target.A],[:target.A])" +] diff --git a/tests/configuration_expression.ml b/tests/configuration_expression.ml new file mode 100644 index 0000000..a5c4755 --- /dev/null +++ b/tests/configuration_expression.ml @@ -0,0 +1,253 @@ +open StdLabels +open OUnit2 +module Expression = ImportExpression.T +module Path = ImportDataTypes.Path +open Path + +let printer = function + | Ok e -> ImportExpression.Repr.repr ImportConf.Path.repr e + | Error msg -> msg + +let parse_dquoted = + "parse_dquoted" >:: fun _ -> + let expr = "match(\"\\(..\\)\", :B)" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (Function + ("match", [ Literal "\\(..\\)"; Path { alias = None; column = 2 } ]))) + result + +let parse_quoted = + "parse_quoted" >:: fun _ -> + let expr = "match('\\(..\\)', :B)" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (Function + ("match", [ Literal "\\(..\\)"; Path { alias = None; column = 2 } ]))) + result + +let concat = + "concat" >:: fun _ -> + let expr = ":A ^ :B" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (Concat + [ + Path { alias = None; column = 1 }; Path { alias = None; column = 2 }; + ])) + result + +let concat2 = + "concat2" >:: fun _ -> + let expr = "'A' ^ '_' ^ 'B'" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok (Concat [ Literal "A"; Literal "_"; Literal "B" ])) + result + +let litteral = + "litteral" >:: fun _ -> + (* The text is quoted in shall not be considered as a path *) + let expr = "':A'" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Literal ":A")) result + +let empty = + "empty" >:: fun _ -> + let expr = "" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok Empty) result + +let upper_nvl = + "upper_nvl" >:: fun _ -> + let expr = "NVL('','')" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Nvl [ Empty; Empty ])) result + +let lower_nvl = + "lower_nvl" >:: fun _ -> + let expr = "nvl('','')" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Nvl [ Empty; Empty ])) result + +let numeric = + "numeric" >:: fun _ -> + let expr = "123" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Integer "123")) result + +let numeric_neg = + "numeric_neg" >:: fun _ -> + let expr = "-123" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Integer "-123")) result + +let op_priority = + "operator_priority" >:: fun _ -> + let expr = "1 + 2 > 2" in + let result = ImportConf.expression_from_string expr + and expected = + ImportExpression.T.( + BOperator (GT, BOperator (Add, Integer "1", Integer "2"), Integer "2")) + in + + assert_equal ~printer (Ok expected) result + +let op_priority2 = + "operator_priority" >:: fun _ -> + let expr = "1 ^ 2 = 2" in + let result = ImportConf.expression_from_string expr + and expected = + ImportExpression.T.( + BOperator (Equal, Concat [ Integer "1"; Integer "2" ], Integer "2")) + in + + assert_equal ~printer (Ok expected) result + +let join = + "join" >:: fun _ -> + let expr = "join('sep', :A, :B)" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (Join + ( "sep", + [ + Path { alias = None; column = 1 }; + Path { alias = None; column = 2 }; + ] ))) + result + +let join_empty = + "join" >:: fun _ -> + let expr = "join('', :A, :B)" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (Join + ( "", + [ + Path { alias = None; column = 1 }; + Path { alias = None; column = 2 }; + ] ))) + result + +let upper = + "upper" >:: fun _ -> + let expr = "upper('')" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Function' (Upper, [ Empty ]))) result + +let trim = + "trim" >:: fun _ -> + let expr = "trim('')" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer (Ok (Function' (Trim, [ Empty ]))) result + +(** Extract the columns from a window function *) +let fold_values = + "fold_values" >:: fun _ -> + (* The expression we want to test *) + let expr = + Expression.Window + ( Previous (Path { alias = None; column = 1 }), + [ Path { alias = None; column = 2 } ], + [ Path { alias = None; column = 3 } ] ) + in + + (* Extract the columns from the expression. The result is sorted because + the order is not preserved during the extraction. *) + let result = + Expression.fold_values ~init:[] ~f:(fun acc v -> v :: acc) expr + |> List.sort ~cmp:Path.compare + in + + let expected = + [ + { alias = None; column = 1 }; + { alias = None; column = 2 }; + { alias = None; column = 3 }; + ] + in + + assert_equal expected result + +let bad_quote = + "bad_quote" >:: fun _ -> + let expr = "':source.A" in + let result = ImportConf.expression_from_string expr in + + assert_equal ~printer (Error "Unclosed quote at line 1 : \"':source.A\"") + result + +let nested_expression = + "nested_expression" >:: fun _ -> + let expr = "1 = (1 = 0)" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (BOperator + ( Equal, + Integer "1", + Expr (BOperator (Equal, Integer "1", Integer "0")) ))) + result + +let priority_equality = + "priority_equality" >:: fun _ -> + let expr = "1 = 1 = 0" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (BOperator + (Equal, Integer "1", BOperator (Equal, Integer "1", Integer "0")))) + result + +let priority_operator_and = + "priority_equality" >:: fun _ -> + let expr = "1 and 1 = 0" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (BOperator (And, Integer "1", BOperator (Equal, Integer "1", Integer "0")))) + result + +let priority_operator_or = + "priority_equality" >:: fun _ -> + let expr = "1 <> 1 or 0" in + let result = ImportConf.expression_from_string expr in + assert_equal ~printer + (Ok + (BOperator + (Or, BOperator (Different, Integer "1", Integer "1"), Integer "0"))) + result + +let test_suit = + [ + parse_dquoted; + parse_quoted; + concat; + concat2; + litteral; + empty; + upper_nvl; + lower_nvl; + numeric; + numeric_neg; + op_priority; + op_priority2; + join; + upper; + trim; + join_empty; + fold_values; + bad_quote; + nested_expression; + priority_equality; + priority_operator_and; + priority_operator_or; + ] + +let tests = "configuration_expression" >::: test_suit diff --git a/tests/configuration_toml.ml b/tests/configuration_toml.ml new file mode 100644 index 0000000..3c8bfc2 --- /dev/null +++ b/tests/configuration_toml.ml @@ -0,0 +1,71 @@ +open OUnit2 +module Expression = ImportExpression.T +module Path = ImportDataTypes.Path + +let test_suit = + [ + ( "parse_extern" >:: fun _ -> + let toml = Otoml.Parser.from_file "configuration/simple.toml" in + let toml = ImportConf.t_of_toml toml in + match toml with + | Error s -> raise (Failure s) + | Ok result -> + let open ImportConf.Syntax in + let expected = + { + target = { file = ""; tab = 1; name = "target" }; + extern_key = Literal "_B"; + intern_key = + Function + ( "function", + [ + Path { alias = None; column = 1 }; + Path { alias = None; column = 2 }; + ] ); + match_rule = None; + allow_missing = true; + } + in + + let printer s = + String.concat "," (List.map ImportConf.print_extern s) + in + + assert_equal ~printer [ expected ] result.externals ); + ( "parse_columns" >:: fun _ -> + let toml = Otoml.Parser.from_file "configuration/simple.toml" in + let toml = ImportConf.t_of_toml toml in + + match toml with + | Error s -> raise (Failure s) + | Ok result -> + let open Path in + let open Expression in + let expected = + [ + Function + ( "function", + [ + Path { alias = Some "target"; column = 1 }; + Path { alias = None; column = 2 }; + Literal "free' text"; + ] ); + Window + ( Counter, + [ Path { alias = Some "target"; column = 1 } ], + [ Path { alias = Some "target"; column = 1 } ] ); + ] + in + + List.iter2 + (fun expected result -> + assert_equal ~printer:ImportConf.print_path_expression expected + result) + expected result.columns ); + ( "parse_csv" >:: fun _ -> + let toml = Otoml.Parser.from_file "configuration/example_csv.toml" in + let toml = ImportConf.t_of_toml toml in + ignore toml ); + ] + +let tests = "configuration_toml" >::: test_suit diff --git a/tests/dune b/tests/dune new file mode 100644 index 0000000..c777aef --- /dev/null +++ b/tests/dune @@ -0,0 +1,17 @@ +(test + (name importer_test) + (deps (source_tree configuration)) + (libraries + ounit2 + otoml + ppx_deriving.runtime + sqlite3 + importConf + importAnalyser + importContainers + importCSV + importDataTypes + importErrors + importExpression + importSQL +)) diff --git a/tests/expression_builder.ml b/tests/expression_builder.ml new file mode 100644 index 0000000..fd9a17f --- /dev/null +++ b/tests/expression_builder.ml @@ -0,0 +1,47 @@ +module T = ImportExpression.T + +let empty : 'a T.t = T.Empty +let path : 'a -> 'a T.t = fun v -> T.Path v +let literal_test : 'a T.t = T.Literal "test" +let literal_quoted : 'a T.t = T.Literal "'" +let literal_zero : 'a T.t = T.Literal "0" +let integer_zero : 'a T.t = T.Integer "0" +let integer_one : 'a T.t = T.Integer "1" +let concat : 'a T.t = T.Concat [ T.Empty; T.Literal "test" ] +let expr : 'a T.t = T.Function ("expr", [ literal_test; T.Literal "NOT NULL" ]) + +let equal : 'a T.t -> 'a T.t -> 'a T.t = + fun e1 e2 -> T.BOperator (T.Equal, e1, e2) + +let different : 'a T.t -> 'a T.t -> 'a T.t = + fun e1 e2 -> T.BOperator (T.Different, e1, e2) + +let divide : 'a T.t -> 'a T.t -> 'a T.t = + fun e1 e2 -> T.BOperator (T.Division, e1, e2) + +let nvl : 'a T.t -> 'a T.t -> 'a T.t = fun e1 e2 -> T.Nvl [ e1; e2 ] + +let if_ : 'a T.t -> 'a T.t -> 'a T.t -> 'a T.t = + fun pred e1 e2 -> T.Function ("if", [ pred; e1; e2 ]) + +let in_ : 'a T.t -> 'a T.t list -> 'a T.t = + fun e1 group -> T.GEquality (T.Equal, e1, group) + +let not_in : 'a T.t -> 'a T.t list -> 'a T.t = + fun e1 group -> T.GEquality (T.Different, e1, group) + +let max : 'a T.t -> 'a T.t list -> 'a T.t list -> 'a T.t = + fun e group order -> T.Window (T.Max e, group, order) + +let counter : 'a T.t list -> 'a T.t list -> 'a T.t = + fun group order -> T.Window (T.Counter, group, order) + +let function' : T.funct -> 'a T.t list -> 'a T.t = + fun name param -> T.Function' (name, param) + +module Make (Sym : ImportExpression.Sym.SYM_EXPR) = struct + module M = ImportExpression.Sym.M (Sym) + + let eval : 'a T.t -> path_repr:'b Sym.path_repr -> 'a Sym.repr = + fun v ~path_repr -> M.eval ~path_repr v +end diff --git a/tests/expression_query.ml b/tests/expression_query.ml new file mode 100644 index 0000000..d260a76 --- /dev/null +++ b/tests/expression_query.ml @@ -0,0 +1,208 @@ +open OUnit2 +module T = ImportExpression.T +module Expr = Expression_builder +module M = Expr.Make (ImportExpression.Query.Query) + +let eval = + M.eval ~path_repr:(fun formatter n -> Format.fprintf formatter "%s" n) + +let printer = Fun.id + +let test_expr ?(nested = ImportExpression.Query.QueryParameter.Literal) expr = + let buffer = Buffer.create 16 in + let formatter = Format.formatter_of_buffer buffer in + let () = ImportExpression.Query.Query.observe expr formatter ~nested in + Format.pp_print_flush formatter (); + Buffer.contents buffer + +let empty = + "empty" >:: fun _ -> + let expr = eval Expr.empty in + let content = test_expr expr and expected = "''" in + + assert_equal ~printer expected content + +let litteral = + "literal" >:: fun _ -> + let expr = eval Expr.literal_test in + let content = test_expr expr and expected = "'test'" in + + assert_equal ~printer expected content + +let litteral_quoted = + "literal_quoted" >:: fun _ -> + let expr = eval Expr.literal_quoted in + let content = test_expr expr and expected = "'\''" in + + assert_equal ~printer expected content + +let litteral_raw = + "literal_raw" >:: fun _ -> + let expr = eval Expr.literal_test in + let nested = ImportExpression.Query.QueryParameter.(Raw Literal) in + let content = test_expr expr ~nested and expected = "test" in + + assert_equal ~printer expected content + +let path = + "path" >:: fun _ -> + (* In the path, the given function do all the job *) + let expr = eval @@ Expr.path "test" in + let content = test_expr expr and expected = "test" in + + assert_equal ~printer expected content + +let concat = + "concat" >:: fun _ -> + let expr = eval Expr.concat in + let content = test_expr expr and expected = "'' || 'test'" in + + assert_equal ~printer expected content + +let nvl = + "nvl" >:: fun _ -> + let expr = eval @@ Expr.nvl Expr.empty Expr.literal_test in + let content = test_expr expr and expected = "COALESCE('', 'test')" in + + assert_equal ~printer expected content + +let upper = + "upper" >:: fun _ -> + let expr = eval @@ Expr.function' T.Upper [ Expr.literal_test ] in + let content = test_expr expr and expected = "UPPER('test')" in + + assert_equal ~printer expected content + +let join = + "join" >:: fun _ -> + let expr = + ImportExpression.Query.Query.( + join "," [ eval Expr.empty; eval Expr.literal_test ]) + in + let content = test_expr expr and expected = "CONCAT(',', '', 'test')" in + + assert_equal ~printer expected content + +let boperator_eq = + "boperator_eq" >:: fun _ -> + let expr = eval @@ Expr.equal Expr.empty Expr.literal_test in + let content = test_expr expr and expected = "''='test'" in + + assert_equal ~printer expected content + +let boperator_div = + "boperator_div" >:: fun _ -> + let expr = eval @@ Expr.divide Expr.integer_one Expr.integer_zero in + let content = test_expr expr and expected = "CAST(1 AS REAL)/0" in + + assert_equal ~printer expected content + +let boperator_neq = + "boperator_neq" >:: fun _ -> + let expr = eval @@ Expr.different Expr.empty Expr.literal_test in + let content = test_expr expr and expected = "''<>'test'" in + + assert_equal ~printer expected content + +let expr = + "expr" >:: fun _ -> + let expr = eval Expr.expr in + let content = test_expr expr and expected = "(test NOT NULL)" in + + assert_equal ~printer expected content + +let unify_int = + "unify_int" >:: fun _ -> + let expr = eval @@ Expr.equal (Expr.path "external") Expr.integer_zero in + let content = test_expr expr and expected = "COALESCE(external,0)=0" in + + assert_equal ~printer expected content + +let unify_string = + "unify_string" >:: fun _ -> + let expr = eval @@ Expr.equal (Expr.path "external") Expr.literal_zero in + let content = test_expr expr and expected = "COALESCE(external,'')='0'" in + + assert_equal ~printer expected content + +let in_string = + "in_string" >:: fun _ -> + let expr = eval @@ Expr.in_ (Expr.path "external") [ Expr.literal_zero ] in + let content = test_expr expr and expected = "COALESCE(external,'') IN('0')" in + assert_equal ~printer expected content + +let not_in_string = + "in_string" >:: fun _ -> + let expr = eval @@ Expr.not_in (Expr.path "external") [ Expr.literal_zero ] in + let content = test_expr expr + and expected = "COALESCE(external,'') NOT IN('0')" in + assert_equal ~printer expected content + +(* Evaluate the max function *) +let max = + "max" >:: fun _ -> + let expr = + eval @@ Expr.(max (path ":C") [ path ":A" ] [ path ":A"; path ":B" ]) + in + + let content = test_expr expr + and expected = + "LAST_VALUE(:C) OVER (PARTITION BY :A ORDER BY :A, :B RANGE BETWEEN \ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)" + in + + assert_equal ~printer expected content + +let in_int = + "in_int" >:: fun _ -> + let expr = + eval + @@ Expr.in_ (Expr.path "external") [ Expr.integer_zero; Expr.integer_one ] + in + let content = test_expr expr and expected = "COALESCE(external,0) IN(0, 1)" in + assert_equal ~printer expected content + +let counter_no_order = + "counter_no_order" >:: fun _ -> + let expr = eval @@ Expr.(counter [ path ":A" ] []) in + + let content = test_expr expr + and expected = "COUNT() OVER (PARTITION BY :A)" in + + assert_equal ~printer expected content + +let counter_order = + "counter_no_order" >:: fun _ -> + let expr = eval @@ Expr.(counter [ path ":A" ] [ path ":B" ]) in + + let content = test_expr expr + and expected = "ROW_NUMBER() OVER (PARTITION BY :A ORDER BY :B)" in + + assert_equal ~printer expected content + +let test_suit = + [ + empty; + litteral; + litteral_quoted; + litteral_raw; + path; + concat; + nvl; + upper; + join; + boperator_eq; + boperator_neq; + boperator_div; + expr; + unify_int; + unify_string; + in_string; + not_in_string; + in_int; + max; + counter_no_order; + counter_order; + ] + +let tests = "expression_query" >::: test_suit diff --git a/tests/expression_repr.ml b/tests/expression_repr.ml new file mode 100644 index 0000000..9ad321a --- /dev/null +++ b/tests/expression_repr.ml @@ -0,0 +1,37 @@ +open OUnit2 +module Expression = ImportExpression.T +module Path = ImportDataTypes.Path +module Expr = Expression_builder +module M = Expr.Make (ImportExpression.Repr.E) + +let eval = M.eval ~path_repr:ImportCSV.Csv.column_to_string +let test_expr expr = ImportExpression.Repr.E.observe ~top:true expr + +let printer = function + | Ok e -> ImportConf.print_path_expression e + | Error msg -> msg + +let print_literal = + "print_litteral" >:: fun _ -> + let result = test_expr @@ eval (Literal "Content") in + assert_equal ~printer:Fun.id "'Content'" result + +let print_quoted_literal = + "print_quoted_literal" >:: fun _ -> + let result = test_expr @@ eval Expression_builder.literal_quoted in + assert_equal ~printer:Fun.id "'\\''" result + +let print_dquoted_literal = + "print_dquoted_literal" >:: fun _ -> + let result = test_expr @@ eval (Literal "\"") in + assert_equal ~printer:Fun.id "'\"'" result + +let print_numeric = + "print_numeric" >:: fun _ -> + let result = test_expr @@ eval (Literal "123") in + assert_equal ~printer:Fun.id "123" result + +let test_suit = + [ print_literal; print_quoted_literal; print_dquoted_literal; print_numeric ] + +let tests = "expression_repr" >::: test_suit diff --git a/tests/expression_type_of.ml b/tests/expression_type_of.ml new file mode 100644 index 0000000..706b3e7 --- /dev/null +++ b/tests/expression_type_of.ml @@ -0,0 +1,65 @@ +open OUnit2 +module T = ImportExpression.T +module Types = ImportDataTypes.Types +module Expr = Expression_builder +module M = Expr.Make (ImportExpression.Type_of) + +let eval = M.eval ~path_repr:(fun _ -> ()) +let printer = Types.string_of_t +let test_expr expr = ImportExpression.Type_of.observe expr + +let empty = + "empty" >:: fun _ -> + let expr = eval Expr.empty in + let content = test_expr expr and expected = Types.None in + + assert_equal ~printer expected content + +(** Control an if statement with a predicate which is not a boolean *) +let invalid_if = + "invalid_if" >:: fun _ -> + (* The expression we want to test *) + let raw_expr = + Expr.if_ Expr.literal_test Expr.integer_zero Expr.integer_one + in + + let exn = + ImportErrors.TypeError + { + expected = Types.Bool; + actual = Types.String; + expression = ImportExpression.Repr.repr Fun.id raw_expr; + subset = "the predicate"; + } + in + assert_raises exn (fun () -> + let expr = eval raw_expr in + test_expr expr) + +(** The application should infer that the expression is a string *) +let valid_if = + "valid_if" >:: fun _ -> + let expr = + eval + @@ Expr.if_ + (Expr.equal Expr.integer_one Expr.integer_zero) + Expr.literal_test Expr.literal_test + in + let content = test_expr expr and expected = Types.String in + assert_equal ~printer expected content + +let upper = + "upper" >:: fun _ -> + let expr = eval @@ Expr.function' T.Upper [Expr.literal_test] in + + let content = test_expr expr and expected = Types.String in + assert_equal ~printer expected content + +let in_int = + "in_int" >:: fun _ -> + let expr = eval @@ Expr.in_ (Expr.path "external") [ Expr.integer_one ] in + let content = test_expr expr and expected = Types.Bool in + assert_equal ~printer expected content + +let tests = "expression_type_of" >::: [ empty; invalid_if; valid_if; in_int ; +upper ] diff --git a/tests/importCSV_test.ml b/tests/importCSV_test.ml new file mode 100644 index 0000000..bc21992 --- /dev/null +++ b/tests/importCSV_test.ml @@ -0,0 +1,29 @@ +open OUnit2 +open ImportCSV + +let test_suit = + [ + ( "Column A" >:: fun _ -> + assert_equal + ~printer:(fun i -> Printf.sprintf "%d (%s)" i (Csv.column_to_string i)) + 1 (Csv.column_of_string "A") ); + ( "Column a" >:: fun _ -> + assert_equal + ~printer:(fun i -> Printf.sprintf "%d (%s)" i (Csv.column_to_string i)) + 1 (Csv.column_of_string "a") ); + ( "Column name" >:: fun _ -> + let () = + for i = 1 to 1_000 do + let column_name = Csv.column_to_string i in + let column_index = Csv.column_of_string column_name in + + assert_equal + ~printer:(fun i -> + Printf.sprintf "%d (%s)" i (Csv.column_to_string i)) + i column_index + done + in + () ); + ] + +let tests = "importCSV_test" >::: test_suit diff --git a/tests/importConf_test.ml b/tests/importConf_test.ml new file mode 100644 index 0000000..c94eb91 --- /dev/null +++ b/tests/importConf_test.ml @@ -0,0 +1,23 @@ +open OUnit2 +open ConfLoader + +(** Test the dependencies extracted from the external named "source". + + Refer to the default configuration used in [ConfLoader] to see the + configuration. + *) +let test_get_dependencies_for_source = + "get_dependancies_for_table" >:: fun _ -> + let result = ImportConf.get_dependancies_for_table conf conf.source + and expected = [ external_other ] in + assert_equal ~printer:pp_externals expected result + +let test_get_dependencies_for_other = + "get_dependancies_for_table" >:: fun _ -> + let result = ImportConf.get_dependancies_for_table conf external_table_other + and expected = [ external_last ] in + assert_equal ~printer:pp_externals expected result + +let tests = + "importConf_test" + >::: [ test_get_dependencies_for_source; test_get_dependencies_for_other ] diff --git a/tests/importer_test.ml b/tests/importer_test.ml new file mode 100644 index 0000000..16ea663 --- /dev/null +++ b/tests/importer_test.ml @@ -0,0 +1,21 @@ +open OUnit2 + +let _ = + run_test_tt_main + ("importer_tests" + >::: [ + ImportCSV_test.tests; + Sql_date.tests; + Sql_match.tests; + Sql_int.tests; + Sql_trim.tests; + ImportConf_test.tests; + Configuration_toml.tests; + Configuration_expression.tests; + Expression_repr.tests; + Expression_type_of.tests; + Expression_query.tests; + Analyser_dependency.tests; + Analyser_query_test.tests; + Sql_db.tests; + ]) diff --git a/tests/sql_date.ml b/tests/sql_date.ml new file mode 100644 index 0000000..4becdf5 --- /dev/null +++ b/tests/sql_date.ml @@ -0,0 +1,18 @@ +open OUnit2 + +let test_suit = + [ + ( "Parse date" >:: fun _ -> + let text_date = Sqlite3.Data.TEXT "2002-04-08 15:59:41.000" + and format_date = Sqlite3.Data.TEXT "%Y-%m-%d %H:%M:%S.000" in + + assert_equal (Sqlite3.Data.INT 37354L) + (ImportSQL.Date.f format_date text_date) ); + ( "Parse date as int" >:: fun _ -> + let int_date = Sqlite3.Data.INT 37354L + and format_date = Sqlite3.Data.TEXT "%Y-%m-%d %H:%M:%S.000" in + + assert_equal int_date (ImportSQL.Date.f format_date int_date) ); + ] + +let tests = "sql_date" >::: test_suit diff --git a/tests/sql_db.ml b/tests/sql_db.ml new file mode 100644 index 0000000..c966f4e --- /dev/null +++ b/tests/sql_db.ml @@ -0,0 +1,198 @@ +(** Test the behavior of the sqlite with a in-memory database *) + +open OUnit2 +open StdLabels + +let ( let* ) res cont = + match res with + | Ok value -> cont value + | Error e -> raise e + +(** Test a process with a simple configuration in-memory *) +let run_test ~configuration ~input ~expected name = + name >:: fun _ -> + (* We expect a valid configuration *) + let conf = + ImportConf.t_of_toml (Otoml.Parser.from_string configuration) + |> Result.get_ok + in + + let exec db = + let table = List.hd @@ ImportAnalyser.Dependency.get_process_order conf in + let* () = ImportSQL.Db.create_table db table in + + (* Prepare the statement in order to import data *) + let* stmt = ImportSQL.Db.prepare_insert db table in + + (* Inject some data into the table *) + let result, _ = + List.fold_left ~init:(Ok (), 0) input ~f:(fun (_, i) data -> + let result = + let* () = ImportSQL.Db.insert ~id:i db stmt data in + let* () = ImportSQL.Db.reset stmt in + Ok () + in + (result, i + 1)) + in + let* () = result in + + let* () = ImportSQL.Db.finalize stmt in + + let expected = ref expected in + + (* Collect the data *) + let* () = + ImportSQL.Db.query db conf ~f:(fun rows -> + match !expected with + | [] -> () + | hd :: tl -> + expected := tl; + let () = + Array.iter2 rows hd ~f:(fun (_, value) expected -> + assert_equal ~printer:ImportCSV.DataType.to_string value + expected) + in + ()) + in + + Ok () + in + + (* Use a magic keyword for in-memory database *) + ignore @@ ImportSQL.Db.with_db ":memory:" exec + +(** Simple test used to check the process *) +let simple_extraction = + run_test "simple_extraction" + ~configuration: + {|version = 1 + +[source] +name = "source_name" +file = "source_file" + +[sheet] +columns = [ + ":A ^ '_'", + ":B", + ":E"]|} + ~input: + [ + [ + (0, ImportCSV.DataType.Integer 123); + (1, ImportCSV.DataType.Integer 2); + (4, ImportCSV.DataType.Integer 5); + ]; + ] + ~expected: + [ + [| + ImportCSV.DataType.Content "123_"; + ImportCSV.DataType.Integer 2; + ImportCSV.DataType.Integer 5; + |]; + ] + +(** Ensure the behavior of the sum function when a filter is given. It is + expected to accumulate the values over each line *) +let sum_sort = + run_test "sum_sort" + ~configuration: + {|version = 1 + +[source] +name = "source_name" +file = "source_file" + +[sheet] +columns = [ + ":A", + "sum(:C, [:B], [:A])", +]|} + ~input: + [ + [ + (0, ImportCSV.DataType.Integer 1); + (1, ImportCSV.DataType.Content "A"); + (2, ImportCSV.DataType.Integer 100); + ]; + [ + (0, ImportCSV.DataType.Integer 2); + (1, ImportCSV.DataType.Content "A"); + (2, ImportCSV.DataType.Integer 100); + ]; + [ + (0, ImportCSV.DataType.Integer 3); + (1, ImportCSV.DataType.Content "A"); + (2, ImportCSV.DataType.Integer 100); + ]; + ] + ~expected: + [ + [| ImportCSV.DataType.Integer 1; ImportCSV.DataType.Integer 100 |]; + [| ImportCSV.DataType.Integer 2; ImportCSV.DataType.Integer 200 |]; + [| ImportCSV.DataType.Integer 3; ImportCSV.DataType.Integer 300 |]; + ] + +let sum_total = + run_test "sum_total" + ~configuration: + {|version = 1 + +[source] +name = "source_name" +file = "source_file" + +[sheet] +columns = [ + ":A", + "sum(:C, [], [])", +]|} + ~input: + [ + [ + (0, ImportCSV.DataType.Integer 1); (2, ImportCSV.DataType.Integer 100); + ]; + [ + (0, ImportCSV.DataType.Integer 2); (2, ImportCSV.DataType.Integer 100); + ]; + ] + ~expected: + [ + [| ImportCSV.DataType.Integer 1; ImportCSV.DataType.Integer 200 |]; + [| ImportCSV.DataType.Integer 2; ImportCSV.DataType.Integer 200 |]; + ] + +(** Ensure the behavior of the sum function when no filter is given. It is + expected to get the total sum for each line *) +let sum_unfiltered = + run_test "sum_unfiltered" + ~configuration: + {|version = 1 + +[source] +name = "source_name" +file = "source_file" + +[sheet] +columns = [ + ":A", + "sum(:C, [], [:A])", +]|} + ~input: + [ + [ + (0, ImportCSV.DataType.Integer 1); (2, ImportCSV.DataType.Integer 100); + ]; + [ + (0, ImportCSV.DataType.Integer 2); (2, ImportCSV.DataType.Integer 100); + ]; + ] + ~expected: + [ + [| ImportCSV.DataType.Integer 1; ImportCSV.DataType.Integer 200 |]; + [| ImportCSV.DataType.Integer 2; ImportCSV.DataType.Integer 200 |]; + ] + +let test_suit = [ simple_extraction; sum_sort; sum_total; sum_unfiltered ] +let tests = "sql_db" >::: test_suit diff --git a/tests/sql_int.ml b/tests/sql_int.ml new file mode 100644 index 0000000..87b1086 --- /dev/null +++ b/tests/sql_int.ml @@ -0,0 +1,26 @@ +open OUnit2 + +let printer = function + | Sqlite3.Data.INT t -> Int64.to_string t + | Sqlite3.Data.NONE -> "None" + | Sqlite3.Data.NULL -> "Null" + | Sqlite3.Data.FLOAT f -> Float.to_string f + | Sqlite3.Data.TEXT t | Sqlite3.Data.BLOB t -> t + +let test_suit = + [ + ( "Int_of_int" >:: fun _ -> + assert_equal (Sqlite3.Data.INT 37354L) + (ImportSQL.Math.int (Sqlite3.Data.INT 37354L)) ); + ( "Int_of_string" >:: fun _ -> + assert_equal (Sqlite3.Data.INT 37354L) + (ImportSQL.Math.int (Sqlite3.Data.TEXT "37354")) ); + ( "Int_of_string2" >:: fun _ -> + assert_equal ~printer (Sqlite3.Data.INT 37354L) + (ImportSQL.Math.int (Sqlite3.Data.TEXT "37354.0")) ); + ( "Int_of_float" >:: fun _ -> + assert_equal (Sqlite3.Data.INT 37354L) + (ImportSQL.Math.int (Sqlite3.Data.FLOAT 37354.0)) ); + ] + +let tests = "sql_int" >::: test_suit diff --git a/tests/sql_match.ml b/tests/sql_match.ml new file mode 100644 index 0000000..0314bb3 --- /dev/null +++ b/tests/sql_match.ml @@ -0,0 +1,12 @@ +open OUnit2 + +let test_suit = + [ + ( "Parse regex" >:: fun _ -> + let text = Sqlite3.Data.TEXT "hello world" + and regex = Sqlite3.Data.TEXT "hello ([A-Za-z]+)" in + + assert_equal (Sqlite3.Data.TEXT "world") (ImportSQL.Match.f regex text) ); + ] + +let tests = "sql_match" >::: test_suit diff --git a/tests/sql_trim.ml b/tests/sql_trim.ml new file mode 100644 index 0000000..77e755e --- /dev/null +++ b/tests/sql_trim.ml @@ -0,0 +1,11 @@ +open OUnit2 + +let test_suit = + [ + ( "Trim" >:: fun _ -> + let text = Sqlite3.Data.TEXT " \nABC \n" in + + assert_equal (Sqlite3.Data.TEXT "ABC") (ImportSQL.Trim.f text) ); + ] + +let tests = "sql_trim" >::: test_suit |