From 6b377719c10d5ab3343fd5221f99a4a21008e25a Mon Sep 17 00:00:00 2001 From: Sébastien Dailly Date: Thu, 14 Mar 2024 08:26:58 +0100 Subject: Initial commit --- lib/analysers/dependency.ml | 256 +++++++++++++++ lib/analysers/dependency.mli | 40 +++ lib/analysers/dune | 12 + lib/analysers/headers.ml | 55 ++++ lib/analysers/headers.mli | 11 + lib/analysers/query.ml | 445 +++++++++++++++++++++++++++ lib/analysers/query.mli | 27 ++ lib/configuration/dune | 29 ++ lib/configuration/expression_lexer.mll | 91 ++++++ lib/configuration/expression_parser.messages | 123 ++++++++ lib/configuration/expression_parser.mly | 185 +++++++++++ lib/configuration/importConf.ml | 90 ++++++ lib/configuration/importConf.mli | 23 ++ lib/configuration/of_json.ml | 134 ++++++++ lib/configuration/read_conf.ml | 216 +++++++++++++ lib/configuration/syntax.ml | 88 ++++++ lib/containers/dune | 7 + lib/containers/importContainers.ml | 61 ++++ lib/csv/csv.ml | 30 ++ lib/csv/dataType.ml | 21 ++ lib/csv/dataType.mli | 8 + lib/csv/dune | 6 + lib/data_types/dune | 10 + lib/data_types/path.ml | 15 + lib/data_types/readme.rst | 4 + lib/data_types/table.ml | 19 ++ lib/data_types/types.ml | 15 + lib/errors/dune | 9 + lib/errors/importErrors.ml | 98 ++++++ lib/errors/importErrors.mli | 46 +++ lib/expression/ast.ml | 31 ++ lib/expression/compose.ml | 150 +++++++++ lib/expression/compose.mli | 59 ++++ lib/expression/dune | 9 + lib/expression/filters.ml | 193 ++++++++++++ lib/expression/filters.mli | 9 + lib/expression/headers.ml | 89 ++++++ lib/expression/headers.mli | 7 + lib/expression/lazier.ml | 71 +++++ lib/expression/query.ml | 335 ++++++++++++++++++++ lib/expression/query.mli | 27 ++ lib/expression/repr.ml | 127 ++++++++ lib/expression/repr.mli | 6 + lib/expression/sym.ml | 71 +++++ lib/expression/t.ml | 153 +++++++++ lib/expression/t.mli | 54 ++++ lib/expression/type_of.ml | 150 +++++++++ lib/expression/type_of.mli | 10 + lib/file_handler/csv2sql.ml | 135 ++++++++ lib/file_handler/csv2sql.mli | 10 + lib/file_handler/dune | 21 ++ lib/file_handler/state.ml | 178 +++++++++++ lib/file_handler/state.mli | 46 +++ lib/file_handler/xlsx2sql.ml | 205 ++++++++++++ lib/file_handler/xlsx2sql.mli | 10 + lib/helpers/console.ml | 16 + lib/helpers/console.mli | 5 + lib/helpers/dune | 8 + lib/helpers/helpers.ml | 45 +++ lib/helpers/toml.ml | 31 ++ lib/helpers/toml.mli | 1 + lib/sql/date.ml | 24 ++ lib/sql/db.ml | 383 +++++++++++++++++++++++ lib/sql/db.mli | 106 +++++++ lib/sql/dune | 15 + lib/sql/hashs.ml | 79 +++++ lib/sql/header.ml | 74 +++++ lib/sql/join.ml | 30 ++ lib/sql/match.ml | 22 ++ lib/sql/math.ml | 20 ++ lib/sql/t.ml | 52 ++++ lib/sql/trim.ml | 9 + lib/sql/year.ml | 19 ++ lib/tools/dune | 10 + lib/tools/git_head.sh | 11 + 75 files changed, 5290 insertions(+) create mode 100644 lib/analysers/dependency.ml create mode 100644 lib/analysers/dependency.mli create mode 100755 lib/analysers/dune create mode 100644 lib/analysers/headers.ml create mode 100644 lib/analysers/headers.mli create mode 100644 lib/analysers/query.ml create mode 100644 lib/analysers/query.mli create mode 100755 lib/configuration/dune create mode 100644 lib/configuration/expression_lexer.mll create mode 100644 lib/configuration/expression_parser.messages create mode 100644 lib/configuration/expression_parser.mly create mode 100644 lib/configuration/importConf.ml create mode 100644 lib/configuration/importConf.mli create mode 100644 lib/configuration/of_json.ml create mode 100644 lib/configuration/read_conf.ml create mode 100644 lib/configuration/syntax.ml create mode 
100755 lib/containers/dune create mode 100644 lib/containers/importContainers.ml create mode 100644 lib/csv/csv.ml create mode 100644 lib/csv/dataType.ml create mode 100644 lib/csv/dataType.mli create mode 100755 lib/csv/dune create mode 100644 lib/data_types/dune create mode 100644 lib/data_types/path.ml create mode 100644 lib/data_types/readme.rst create mode 100644 lib/data_types/table.ml create mode 100644 lib/data_types/types.ml create mode 100644 lib/errors/dune create mode 100644 lib/errors/importErrors.ml create mode 100644 lib/errors/importErrors.mli create mode 100644 lib/expression/ast.ml create mode 100644 lib/expression/compose.ml create mode 100644 lib/expression/compose.mli create mode 100755 lib/expression/dune create mode 100644 lib/expression/filters.ml create mode 100644 lib/expression/filters.mli create mode 100644 lib/expression/headers.ml create mode 100644 lib/expression/headers.mli create mode 100644 lib/expression/lazier.ml create mode 100644 lib/expression/query.ml create mode 100644 lib/expression/query.mli create mode 100644 lib/expression/repr.ml create mode 100644 lib/expression/repr.mli create mode 100644 lib/expression/sym.ml create mode 100644 lib/expression/t.ml create mode 100644 lib/expression/t.mli create mode 100644 lib/expression/type_of.ml create mode 100644 lib/expression/type_of.mli create mode 100644 lib/file_handler/csv2sql.ml create mode 100644 lib/file_handler/csv2sql.mli create mode 100755 lib/file_handler/dune create mode 100644 lib/file_handler/state.ml create mode 100644 lib/file_handler/state.mli create mode 100644 lib/file_handler/xlsx2sql.ml create mode 100644 lib/file_handler/xlsx2sql.mli create mode 100644 lib/helpers/console.ml create mode 100644 lib/helpers/console.mli create mode 100755 lib/helpers/dune create mode 100755 lib/helpers/helpers.ml create mode 100644 lib/helpers/toml.ml create mode 100644 lib/helpers/toml.mli create mode 100644 lib/sql/date.ml create mode 100644 lib/sql/db.ml create mode 100644 lib/sql/db.mli create mode 100644 lib/sql/dune create mode 100644 lib/sql/hashs.ml create mode 100644 lib/sql/header.ml create mode 100644 lib/sql/join.ml create mode 100644 lib/sql/match.ml create mode 100644 lib/sql/math.ml create mode 100644 lib/sql/t.ml create mode 100644 lib/sql/trim.ml create mode 100644 lib/sql/year.ml create mode 100644 lib/tools/dune create mode 100755 lib/tools/git_head.sh (limited to 'lib') diff --git a/lib/analysers/dependency.ml b/lib/analysers/dependency.ml new file mode 100644 index 0000000..e81cc49 --- /dev/null +++ b/lib/analysers/dependency.ml @@ -0,0 +1,256 @@ +open StdLabels +module IntSet = ImportContainers.IntSet +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module Expression = ImportExpression.T + +(* + Internally, the dependency chain is represented as a graph. + + Each file to process (csv or xlsx) is marked as a node, The link between + each node cannot be represented in the graph because each index can imply + multiple tables together (for exemple [join(source.A, other.B)] ). So the + graph is only used in order to get the import process order, and each + information is keeped in a separate table. 
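   To make the shape concrete (an illustrative sketch only; the hypothetical
   [order] helper below is not part of this module, the real logic lives in
   [get_process_order]): the graph is a plain association list pairing a node
   with the nodes recorded alongside it, which is exactly the shape consumed
   by [Tsort.sort]:

     (* Hypothetical sketch: turn a dependency list into a processing order,
        or [None] when the sources form a cycle. *)
     let order (deps : ('a * 'a list) list) : 'a list option =
       match Tsort.sort deps with
       | Tsort.Sorted elements -> Some elements
       | Tsort.ErrorCycle _ -> None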
+ +*) + +type deps = (ImportContainers.Source.t * ImportContainers.Source.t list) list + +type key = { + name : string; + expression : Path.column Expression.t; + columns : ImportContainers.IntSet.t Lazy.t; +} + +type t = { + table : Table.t; + columns : IntSet.t; + keys : key list; +} + +let table t = t.table +let columns t = t.columns +let keys t = t.keys + +type build_map = t ImportContainers.Externals.t + +(* The expression can be qualified with full path (when we are in the column + definition) or only with a column (in a binding with an externanl table). + + This type is here to extract the correct information, according to the type + we are dealing with : + + - [to_mapping] allow to pick the right Set in which we need to add the + column pointed by the path (this can be [keys] or [columns] + - [of_path] is need to extract the qualified source from any kind of path. +*) +type 'a expression_extractor = { + to_mapping : t -> Path.column -> t; + of_path : 'a -> string option * Path.column; +} + +(** [add_path_in_map f parent path ] Extract the column from element [path] and + process the column in the function [f] + + The [path] is abstract, but the function [f.of_path] can extract the needed + elements in order to add it in the mapping. + + The function may raise [Unknown_source] if the the path describe an unknown + table. *) +let add_path_in_map : + f:'a expression_extractor -> conf:Syntax.t -> 'a -> build_map -> build_map = + fun ~f ~conf path map -> + let table_source, column = f.of_path path in + let table = + try ImportConf.get_table_for_name conf table_source with + | Not_found -> raise (ImportErrors.Unknown_source (Option.get table_source)) + in + + ImportContainers.Externals.update map + ~key:(ImportContainers.KeyName.from_table table) ~f:(fun v -> + let mapping = + match v with + | None -> raise (ImportErrors.Unknown_source table.name) + | Some mapping -> mapping + in + + Some (f.to_mapping mapping column)) + +let add_expression_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t -> + build_map -> + build_map = + fun ~f ~conf expr map -> + Expression.fold_values expr ~init:map ~f:(fun map p -> + add_path_in_map ~f ~conf p map) + +let add_columns_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t list -> + build_map -> + build_map = + fun ~f ~conf columns map -> + let columns = + List.fold_left columns ~init:map ~f:(fun map expression -> + let new_map = add_expression_in_map ~f ~conf expression map in + new_map) + in + columns + +(* [add_dependancies ~conf source map path] + add the dependancy from the table [source] to another one after analysing the + expression and extracting the path contained inside. + + This function is called for each path declared inside the expression. *) +let add_dependancies : conf:Syntax.t -> Syntax.extern -> deps -> Path.t -> deps + = + fun ~conf extern graph path -> + let source_table = ImportConf.get_table_for_name conf path.Path.alias in + + let source = ImportContainers.Source.from_table source_table in + let target = ImportContainers.Source.from_table extern.target in + + match ImportContainers.Source.equal target source with + | true -> graph + | _ -> (target, [ source ]) :: graph + +let add_external_in_map : + conf:Syntax.t -> Syntax.extern -> build_map * deps -> build_map * deps = + fun ~conf extern (map, graph) -> + let dest = ImportContainers.KeyName.from_table extern.target in + (* Pre-check that every source is already declared in the configuration. 
*) + let _ = + Expression.fold_values extern.intern_key ~init:() ~f:(fun () path -> + try + let _ = ImportConf.get_table_for_name conf path.Path.alias in + () + with + | Not_found -> ( + match path.alias with + | Some table -> raise (ImportErrors.Unknown_source table) + | None -> + (* This is very unlikely. This error would be raised if we have + no source for this import *) + let root = conf.source in + raise (ImportErrors.Unknown_source root.name))) + in + + (* Create the new key with all the expression and all the columns inside it *) + let new_key = + { + name = extern.target.Table.name; + expression = extern.extern_key; + columns = + lazy + (Expression.fold_values extern.extern_key + ~f:(fun acc k -> ImportContainers.IntSet.add k acc) + ~init:ImportContainers.IntSet.empty); + } + in + let build_map = + ImportContainers.Externals.update map ~key:dest ~f:(function + | None -> + (* Create the entry for the external if it does not exists *) + Some + { + table = extern.target; + columns = IntSet.empty; + keys = [ new_key ]; + } + | Some mapping -> + (* Or update the existing one with the key we just created *) + Some { mapping with keys = new_key :: mapping.keys }) + in + let graph = + Expression.fold_values extern.intern_key ~init:graph + ~f:(add_dependancies ~conf extern) + in + let build_map = + add_expression_in_map extern.intern_key build_map ~conf + ~f: + { + of_path = + (fun Path.{ alias; column } -> + let table = ImportConf.get_table_for_name conf alias in + (Some table.name, column)); + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + } + in + (build_map, graph) + +let mapper = + { + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + of_path = (fun ({ alias; column } : Path.t) -> (alias, column)); + } + +let get_mapping : Syntax.t -> build_map * deps = + fun conf -> + let root = ImportContainers.Source.from_table (ImportConf.root_table conf) + and root' = + ImportContainers.KeyName.from_table (ImportConf.root_table conf) + in + let graph = [ (root, []) ] in + + (* For each external declared in the configuration file, add the columns to + query *) + let init = + ( ImportContainers.Externals.singleton root' + { + table = ImportConf.root_table conf; + columns = IntSet.empty; + keys = []; + }, + graph ) + in + let map, graph = + List.fold_left conf.externals ~init ~f:(fun map extern -> + add_external_in_map ~conf extern map) + in + + (* Now we don’t bother anymore with the graph and it’s dependency, we just + collect all the columns in the differents expressions *) + let map = + map + |> add_columns_in_map ~conf ~f:mapper conf.columns + |> add_columns_in_map ~conf ~f:mapper conf.sort + |> add_columns_in_map ~conf ~f:mapper conf.filters + |> add_columns_in_map ~conf ~f:mapper conf.uniq + in + (map, graph) + +let get_process_order : Syntax.t -> t list = + fun map -> + let map, graph = get_mapping map in + + match Tsort.sort graph with + | Tsort.ErrorCycle l -> + let name_of_key k = + ImportContainers.(Externals.find (Source.name k) map).table.name + in + raise (ImportErrors.Cycle (List.map ~f:name_of_key l)) + | Sorted elements -> + (* It’s OK, we know there is no cycles now, we can extract the files to + load from this list. + *) + List.filter_map elements ~f:(fun v -> + ImportContainers.Externals.find_opt + (ImportContainers.Source.name v) + map) + (* This list can still have duplicated values, and we have to remove them + still keeping the order. 
+ *) + |> List.fold_left ~init:[] ~f:(fun acc element -> + (* Prevent the same file to beeing loaded twice *) + match List.mem element ~set:acc with + | true -> acc + | false -> element :: acc) diff --git a/lib/analysers/dependency.mli b/lib/analysers/dependency.mli new file mode 100644 index 0000000..c89522a --- /dev/null +++ b/lib/analysers/dependency.mli @@ -0,0 +1,40 @@ +type t + +val get_process_order : ImportConf.Syntax.t -> t list +(** Extract the file list to process, following the identified dependancies. + Try to load first the document which does not required another spreadsheet, + and keep going in the topological order + + Raise [Unknown_source file] if a source is referenced but is not declared. + The order matter : the exception will be raised in a source is referenced + before the declaration (when chaining differents externals) *) + +val table : t -> ImportDataTypes.Table.t +(** Get the table to proceed. *) + +val columns : t -> ImportContainers.IntSet.t +(** A set of columns loaded in this table. Thoses columns may not need + reported in the final export. + + Each column is identified by is index in the Excel document. + + This set does not include the columns used in the keys. They can be + fetched with the [keys] function *) + +type key = { + name : string; (** This is the name of the target table we are pointed to *) + expression : ImportDataTypes.Path.column ImportExpression.T.t; + (** The expression used as key *) + columns : ImportContainers.IntSet.t Lazy.t; + (** The list of columns used in the key. All the columns are referenced + in the expression. We can have many columns used inside a single + key when a function is used (for example for joining multiple + columns into a single key) *) +} +(** This type describe the join key in a table. The name is the refering table + using this key (the key name in the datable is key_"name" ), and the + expression describe how to build the key. *) + +val keys : t -> key list +(** [keys] is the list of columns pointed by another one. They are + considered as join key between the diffrent tables. *) diff --git a/lib/analysers/dune b/lib/analysers/dune new file mode 100755 index 0000000..1bbc30f --- /dev/null +++ b/lib/analysers/dune @@ -0,0 +1,12 @@ +(library + (name importAnalyser) + (libraries + importConf + importContainers + importCSV + importDataTypes + importExpression + importErrors + tsort + ) +) diff --git a/lib/analysers/headers.ml b/lib/analysers/headers.ml new file mode 100644 index 0000000..916dfee --- /dev/null +++ b/lib/analysers/headers.ml @@ -0,0 +1,55 @@ +open StdLabels +module I = ImportConf +module E = ImportExpression.T +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +module SheeetMap = Map.Make (struct + type t = Table.t + + (** We are sure we can’t have the same name for two different table. *) + let compare v1 v2 = String.compare (Table.name v1) (Table.name v2) +end) + +type content = string array + +type t = content SheeetMap.t +(** The map associate a line of headers for each table. + + The header are always in string. *) + +(** Get the headers. The function has to be called after reading each document, + and will reformat the first line with the values from the cell. The + functions will not be evaluated (instead they will be displayed "as is". + + When there is no value for this path, return empty string. 
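    For illustration (hypothetical values): if the first row of the source
    sheet is [| "Name"; "City" |], the configured column [:B] is rendered as
    the header text "City", while an expression such as [upper(:B)] keeps the
    function name visible and only the header text is substituted inside it.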
+ *) +let columns : Syntax.t -> t -> string list = + fun conf t -> + (* We build here a custom printer which search in the array for the column + name. + + This function will be given as argument in the expression printer. *) + let f : Path.t -> Buffer.t -> unit = + fun path b -> + let source = I.get_table_for_name conf path.alias in + + match SheeetMap.find_opt source t with + | None -> () + | Some arr -> ( + try Buffer.add_string b (Array.get arr (path.column - 1)) with + | _ -> + prerr_endline + @@ Printf.sprintf "No header found for :%s.%s" + (Option.value ~default:(I.root_table conf).Table.name + path.alias) + (ImportCSV.Csv.column_to_string path.column)) + in + + List.map conf.Syntax.columns ~f:(fun column -> + let b = Buffer.create 4 in + + ImportExpression.Headers.headers_of_expression b f column; + + Buffer.contents b) diff --git a/lib/analysers/headers.mli b/lib/analysers/headers.mli new file mode 100644 index 0000000..03e384b --- /dev/null +++ b/lib/analysers/headers.mli @@ -0,0 +1,11 @@ +module SheeetMap : Map.S with type key = ImportDataTypes.Table.t + +type t = string array SheeetMap.t + +val columns : ImportConf.Syntax.t -> t -> string list +(** Get the headers. The function has to be called after reading each document, + and will reformat the first line with the values from the cell. The + functions will not be evaluated (instead they will be displayed "as is". + + When there is no value for this path, return empty string. + *) diff --git a/lib/analysers/query.ml b/lib/analysers/query.ml new file mode 100644 index 0000000..7a6dd2a --- /dev/null +++ b/lib/analysers/query.ml @@ -0,0 +1,445 @@ +open StdLabels +module Expression = ImportExpression +module Q = Expression.Query +module Syntax = ImportConf.Syntax +module Table = ImportConf.Table +module Path = ImportConf.Path + +let truncate buffer n = Buffer.truncate buffer (Buffer.length buffer - n) + +(** The module allow to create fragment in the query which keep together the + binderd parameters and the text of the query.contents. + + This is used a lot in order to create the CTE, where you need the create + fragment used both in the main request and partially in the CTE itself. + + The content is mutable and all the functions are returning [unit]. *) +module Chunk = struct + type t = { + b : Buffer.t; + parameters : ImportCSV.DataType.t Queue.t; + } + + let create : unit -> t = + fun () -> { b = Buffer.create 16; parameters = Queue.create () } + + let create' : Buffer.t -> ImportCSV.DataType.t Queue.t -> t = + fun b parameters -> { b; parameters } + + (* Append the element from [tail] at the end of [head] + + Tail is destroyed during the operation. + *) + let append : head:t -> tail:t -> unit = + fun ~head ~tail -> + match Buffer.length tail.b with + | 0 -> () + | _ -> + Buffer.add_buffer head.b tail.b; + Queue.transfer tail.parameters head.parameters; + () + + (** Add a litteral string in the sequence *) + let add_string : t -> string -> unit = fun t v -> Buffer.add_string t.b v + + let copy : t -> t = + fun t -> + let b = Buffer.create 16 and parameters = Queue.copy t.parameters in + Buffer.add_buffer b t.b; + { b; parameters } + + let add_parameters : t -> ImportCSV.DataType.t Seq.t -> unit = + fun t p -> Queue.add_seq t.parameters p +end + +let prepare_key : f:(Format.formatter -> unit) -> Format.formatter -> unit = + fun ~f formatter -> Format.fprintf formatter "rtrim(upper(%t))" f + +(* Collect all the tables pointed by the expression. 
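   For instance (a hypothetical expression): with [join(' ', :A, :clients.B)]
   the fold below returns the root source (alias [None]) together with the
   table registered under the alias "clients", deduplicated by name.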
*) +let pointed_tables : Syntax.t -> 'a Expression.T.t -> (Table.t * string) list = + fun conf expression -> + Expression.T.fold_values expression ~init:[] ~f:(fun acc path -> + let table = ImportConf.get_table_for_name conf path.Path.alias in + let table_name = Table.name table in + (table, table_name) :: acc) + |> List.sort_uniq ~cmp:Stdlib.compare + +(** Represent a column in a safe way in a query *) +let print_column : Table.t -> string -> string = + fun table column -> + String.concat ~sep:"" [ "'"; table.Table.name; "'.'"; column; "'" ] + +let create_table : Dependency.t -> string = + fun mapping -> + let b = Buffer.create 64 in + + Buffer.add_string b "CREATE TABLE '"; + Buffer.add_string b (Table.name (Dependency.table mapping)); + Buffer.add_string b "' (id INTEGER PRIMARY KEY"; + + List.iter (Dependency.keys mapping) ~f:(fun { Dependency.name; _ } -> + Buffer.add_string b ",'key_"; + Buffer.add_string b name; + Buffer.add_string b "'"); + + ImportContainers.IntSet.iter (Dependency.columns mapping) ~f:(fun i -> + Buffer.add_string b ",'col_"; + Buffer.add_string b (string_of_int i); + Buffer.add_string b "'"); + Buffer.add_string b ")"; + + Buffer.contents b + +let show_path : conf:Syntax.t -> Format.formatter -> Path.t -> unit = + fun ~conf buffer { alias; column } -> + let table = ImportConf.get_table_for_name conf alias in + let table_name = table.Table.name in + Format.fprintf buffer "'%s'.col_%d" table_name column + +(** Extract the informations from the dependancies. We get two informations here : + + - the join query in order to load the data from the external column + - the column corresponding to the key in order to identify the missing + links later. + *) +let query_of_external : + conf:Syntax.t -> join_buffer:Chunk.t -> Syntax.extern -> unit = + fun ~conf ~join_buffer external_ -> + let extern_table = Table.name external_.target in + + let formatter = Format.formatter_of_buffer join_buffer.b in + Format.fprintf formatter "\nLEFT JOIN '%s' AS '%s' ON %t = %s" extern_table + external_.target.name + (prepare_key ~f:(fun f -> + let q = + Q.query_of_expression Q.BindParam f (show_path ~conf) + external_.intern_key + in + + Chunk.add_parameters join_buffer (Queue.to_seq q))) + (print_column external_.Syntax.target + ("key_" ^ external_.Syntax.target.name)); + + Format.pp_print_flush formatter () + +(** Create the from part of the query, adding all the reuired externals. + + SQLite is able to optimize the query and do not load the table not used in + the select clause. *) +let create_from_chunck : Syntax.t -> Chunk.t -> unit = + fun conf c -> + Chunk.add_string c "\nFROM '"; + Chunk.add_string c (Table.name conf.source); + Chunk.add_string c "' AS '"; + Chunk.add_string c conf.source.name; + Chunk.add_string c "'"; + + (* Add the externals in the query *) + List.iter conf.externals ~f:(query_of_external ~conf ~join_buffer:c) + +(** Build a CTE query in order to use any group function inside the query. + Return the binded parameters used in the expression. The buffer given in + argument is also modified during the construction. + + If filters is not None, the clauses are added to the CTE. 
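    As a sketch (hedged: the exact SQL depends on the expression printer and
    on the configured names), the chunk built here has the shape

      WITH cte AS (
        SELECT <source>.id, <expression> AS group0
        FROM '<source table>' AS '<source>' <joins>
        WHERE <filters>)

    and the main query produced by [select] later re-attaches it with
    [INNER JOIN 'cte' ON cte.id = <source>.id].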
*) +let build_cte : + Syntax.t -> + expression:'a Expression.T.t -> + filters:Chunk.t option -> + Chunk.t = + fun conf ~expression ~filters -> + (* The binded parameters queue will be used later in the full query *) + let cte_chunk = Chunk.create () in + + Chunk.add_string cte_chunk "WITH cte AS (SELECT "; + Chunk.add_string cte_chunk conf.source.name; + Chunk.add_string cte_chunk ".id, "; + + let formatter = Format.formatter_of_buffer cte_chunk.b in + + let p = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) expression + in + Format.pp_print_flush formatter (); + Chunk.add_parameters cte_chunk (Queue.to_seq p); + (* The name is hardcoded here, and used in [Expression.Filters.window] *) + Chunk.add_string cte_chunk " AS group0"; + + let () = create_from_chunck conf cte_chunk in + let () = + match filters with + | None -> () + | Some filters_chunk -> + Chunk.append ~head:cte_chunk ~tail:(Chunk.copy filters_chunk) + in + + Chunk.add_string cte_chunk ")\n"; + cte_chunk + +type filter_evaluation = { + content : Buffer.t; + parameters : ImportCSV.DataType.t Seq.t; + cte : (string * Chunk.t) option; +} +(** Build the filters to apply in the query. We make the difference here + between the predicates to apply directly in the query, and the filters + associated with a group, which are required to be transformed into a CTE +in SQL, and are evaluated before. *) + +(** Evaluate the filters on the query *) +let eval_filters : Syntax.t -> filter_evaluation = + fun conf -> + match conf.filters with + | [] -> + let empty_buffer = Buffer.create 0 in + { content = empty_buffer; parameters = Seq.empty; cte = None } + | filters -> ( + (* Create a new queue in order to accumulate all the parameters to bind. + This filter will be given to both the CTE if any, or reused in the + main query when there is no CTE. + *) + let chunk_filters = Chunk.create () in + Chunk.add_string chunk_filters "\nWHERE "; + + let group = Chunk.create () in + + let with_cte, with_exr = + List.fold_left filters ~init:(None, false) + ~f:(fun (with_cte, with_exr) column -> + (* The function will return an option in second position which is + None when no Group function where found, and Some Expression + otherwise *) + let b = Buffer.create 16 in + + let formatter = Format.formatter_of_buffer b in + let queue, group_found = + Expression.Filters.query_of_expression Q.BindParam formatter + (show_path ~conf) column + in + Format.pp_print_flush formatter (); + let clause = Chunk.create' b queue in + + match (group_found, with_cte) with + | None, _ -> + Chunk.append ~head:chunk_filters ~tail:clause; + Chunk.add_string chunk_filters "\nAND "; + (with_cte, true) + | (Some _ as group'), None -> + (* We have a group here, we do not add it into the + filter_buffer right now. + + This can occur only once, the second one will raise + an error. *) + Chunk.append ~head:group ~tail:clause; + (group', with_exr) + | Some _, Some _ -> raise ImportErrors.MisplacedWindow) + in + + match with_cte with + | None -> + let content = chunk_filters.b in + truncate content 5; + { + (* There is no group clause in the query *) + content; + parameters = Queue.to_seq chunk_filters.parameters; + cte = None; + } + | Some expression -> + let filters = + if with_exr then ( + (* If we have additionnals filters from the group clause, we + have to report them in the CTE instead of the main query. 
*) + let c' = Chunk.copy chunk_filters in + truncate c'.b 5; + Some c') + else None + in + + (* Create the common expression table *) + let cte_parameters = build_cte conf ~expression ~filters in + Chunk.append ~head:chunk_filters ~tail:group; + + { + content = chunk_filters.b; + parameters = Queue.to_seq chunk_filters.parameters; + (* The name is hardcoded here, and used in + [Expression.Filters.window] *) + cte = Some ("cte", cte_parameters); + }) + +type query = { + q : string; + parameters : ImportCSV.DataType.t Seq.t; +} + +(** Build the query and return also the mapping in order to identify each + external links between files. + + The select query will name each column with an alias, and the map allow to + find which source is pointed by this alias. *) +let select : Syntax.t -> query * Path.t ImportExpression.T.t array = + fun conf -> + (* If the filters contains a group expression, we need to transform this into + a CTE, which have to be evaluated before the main query. That’s why we are + evaluating the filters right now.*) + let filters = eval_filters conf in + let b = Buffer.create 256 in + let parameters = Queue.create () in + + Option.iter + (fun (_, (cte : Chunk.t)) -> + Buffer.add_buffer b cte.b; + Queue.add_seq parameters (Queue.to_seq cte.parameters)) + filters.cte; + + (* For each column in the configuration file, add the corresponding element + in the query. + + The Sqlite driver return the elements in an array, we create an array to + in order to manage the elements together. + *) + let headers = Array.make (List.length conf.columns) (Obj.magic None) in + + let columns = List.to_seq conf.columns |> Seq.mapi (fun i c -> (i, c)) in + let formatter = Format.formatter_of_buffer b in + let () = + Format.fprintf formatter "SELECT %a" + (Format.pp_print_seq + ~pp_sep:(fun f () -> Format.fprintf f ",\n") + (fun formatter (i, column) -> + Array.set headers i column; + let p = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer p parameters; + Format.fprintf formatter " AS result_%d" i)) + columns + in + Format.pp_print_flush formatter (); + + let () = create_from_chunck conf (Chunk.create' b parameters) in + + (* If the query has a CTE, link it as well. 
We use an INNER JOIN here because + we want to be sure to get all the rows fetched by the CTE + *) + let () = + match filters.cte with + | None -> () + | Some (name, _) -> + Buffer.add_string b "\nINNER JOIN '"; + Buffer.add_string b name; + Buffer.add_string b "' ON "; + Buffer.add_string b name; + Buffer.add_string b ".id = "; + Buffer.add_string b conf.source.name; + Buffer.add_string b ".id" + in + + Buffer.add_buffer b filters.content; + Queue.add_seq parameters filters.parameters; + + let formatter = Format.formatter_of_buffer b in + (match conf.Syntax.uniq with + | [] -> () + | uniq -> + Format.fprintf formatter "\nGROUP BY %a" + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f ", ") + (fun formatter column -> + let seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer seq parameters)) + uniq); + (match conf.Syntax.sort with + | [] -> () + | sort -> + Format.fprintf formatter "\nORDER BY %a" + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f ", ") + (fun formatter column -> + let seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + column + in + Queue.transfer seq parameters)) + sort); + Format.pp_print_flush formatter (); + + ({ q = Buffer.contents b; parameters = Queue.to_seq parameters }, headers) + +let check_external : Syntax.t -> Syntax.extern -> query = + fun conf external_ -> + let extern_table = Table.name external_.target in + + let parameters = Queue.create () in + let internal_key_buffer = Buffer.create 16 in + let formatter = Format.formatter_of_buffer internal_key_buffer in + let internal_key_seq = + Q.query_of_expression Q.BindParam formatter (show_path ~conf) + external_.intern_key + in + Format.pp_print_flush formatter (); + + let external_key_buffer = Buffer.create 16 in + let pointed_tables = pointed_tables conf external_.intern_key in + Buffer.add_string external_key_buffer + (print_column external_.Syntax.target + ("key_" ^ external_.Syntax.target.name)); + + (* We do a copy before the transfert because the Queue is reused later in the + query *) + Queue.transfer (Queue.copy internal_key_seq) parameters; + + let join_content = Buffer.contents external_key_buffer in + let inner_content = Buffer.contents internal_key_buffer in + let b = Buffer.create 256 in + let formatter = Format.formatter_of_buffer b in + let () = + Format.fprintf formatter + "SELECT %a%s FROM%a LEFT JOIN '%s' AS '%s' ON %t = %s WHERE %s IS NULL \ + AND %s IS NOT NULL AND %s <> ''" + (fun formatter -> function + | [ (table, _name) ] -> + Format.fprintf formatter "%s, " (print_column table "id") + | _ -> Format.fprintf formatter "-1, ") + pointed_tables (* *) + inner_content (* *) + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.pp_print_text f ", ") + (fun formatter (table, name) -> + Format.fprintf formatter "\n'%s' AS '%s'" name table.Table.name)) + pointed_tables (* *) + extern_table (* *) + external_.target.name + (prepare_key ~f:(fun b -> + Format.pp_print_text b (Buffer.contents internal_key_buffer))) + join_content (* *) + join_content (* *) + inner_content (* *) + inner_content + in + + Format.pp_print_flush formatter (); + + { q = Buffer.contents b; parameters = Queue.to_seq parameters } + +let build_key_insert : Buffer.t -> Dependency.key -> unit = + fun buffer { Dependency.expression; _ } -> + let show_column : Format.formatter -> Path.column -> unit = + fun formatter column -> Format.fprintf formatter ":col_%d" column + in + + let formatter = Format.formatter_of_buffer buffer in + + let 
() = + prepare_key formatter ~f:(fun formatter -> + Q.query_of_expression Q.NoParam formatter show_column expression) + in + + Format.pp_print_flush formatter (); + + () diff --git a/lib/analysers/query.mli b/lib/analysers/query.mli new file mode 100644 index 0000000..14d2807 --- /dev/null +++ b/lib/analysers/query.mli @@ -0,0 +1,27 @@ +module Syntax = ImportConf.Syntax + +val create_table : Dependency.t -> string + +type query = { + q : string; (** The query to execute *) + parameters : ImportCSV.DataType.t Seq.t; +} +(** This type represent a query to execute. + [q] is the template to run, and shall be run with all the binded parameters. + *) + +val select : + Syntax.t -> query * ImportDataTypes.Path.t ImportExpression.T.t array + +val check_external : Syntax.t -> Syntax.extern -> query +(** Create a query which select all the missing key in an external *) + +val build_key_insert : Buffer.t -> Dependency.key -> unit +(* Build the fragment insert query. + + This use prepare statement and apply the function at the insert time, in + order to have the key already in the expected format in the database. + + The column are name :col_XX where XX is the index of the column in the + datasheet (starting from 1) +*) diff --git a/lib/configuration/dune b/lib/configuration/dune new file mode 100755 index 0000000..27d31a6 --- /dev/null +++ b/lib/configuration/dune @@ -0,0 +1,29 @@ +(library + (name importConf) + (libraries + decoders + otoml + menhirLib + importCSV + yojson + re + helpers + importDataTypes + importExpression + importErrors + ) + +(preprocess (pps ppx_yojson_conv ppx_deriving.ord)) +) + +(rule + (targets expression_parser_messages.ml) + (deps expression_parser.messages expression_parser.mly) + (action (with-stdout-to %{targets} (run menhir --compile-errors %{deps})))) + +(menhir + (modules expression_parser) + (flags --table) +) + +(ocamllex expression_lexer) diff --git a/lib/configuration/expression_lexer.mll b/lib/configuration/expression_lexer.mll new file mode 100644 index 0000000..cbfc8dc --- /dev/null +++ b/lib/configuration/expression_lexer.mll @@ -0,0 +1,91 @@ +{ + open Expression_parser + module Expression = ImportExpression.T + + exception UnclosedQuote of { content: string ; line : int} +} + + +let spaces = [ ' ' '\t' ] +let letters = [^ '"' '\'' '(' ')' '[' ']' ':' '.' ',' '^' ' ' '\t' '\n' '\r' ] +let digit = [ '0'-'9' ] +let eol = [ '\r' '\n' ] + +let escaped = [ '\'' '\\'] + +rule token = parse +| eol { Lexing.new_line lexbuf; token lexbuf } +| spaces { token lexbuf } +| '\'' { + try read_quoted_string (Buffer.create 17) lexbuf + with Failure _ -> + let line = lexbuf.Lexing.lex_curr_p.pos_lnum + and content = Bytes.to_string lexbuf.Lexing.lex_buffer in + raise (UnclosedQuote {line; content}) +} +| '"' { read_dquoted_string (Buffer.create 17) lexbuf } +| '#' { skip_comment lexbuf } +| '(' { L_PAREN } +| ')' { R_PAREN } +| '[' { L_BRACKET } +| ']' { R_BRACKET } +| ':' { COLUMN } +| '.' 
{ DOT } +| ',' { COMA } +| '^' { CONCAT_OPERATOR } +| '+' { BINARY_OPERATOR (Expression.Add) } +| '-' { BINARY_OPERATOR (Expression.Minus) } +| '/' { BINARY_OPERATOR (Expression.Division) } +| "and" { BOOL_OPERATOR (Expression.And) } +| "or" { BOOL_OPERATOR (Expression.Or) } +| '<' { INEQUALITY_OPERATOR (Expression.LT) } +| '>' { INEQUALITY_OPERATOR (Expression.GT) } +| "<>" { EQUALITY_OPERATOR (Expression.Different) } +| '=' { EQUALITY_OPERATOR (Expression.Equal) } +| digit+ as l { INTEGER l} +| '-' digit+ as l { INTEGER l} +| letters+ as l { IDENT l} +| eof { EOF } + +and skip_comment = parse + | [^ '\r' '\n' ] + { skip_comment lexbuf } + | eol + { token lexbuf } + +(* Read the content until we got another one quote *) +and read_quoted_string buf = parse + | [^ '\'' '\\' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_quoted_string buf lexbuf + } + | "\\\'" + { Buffer.add_char buf '\''; + read_quoted_string buf lexbuf + } + | '\\' + { Buffer.add_char buf '\\'; + read_quoted_string buf lexbuf + } + | '\'' + { LITERAL (Buffer.contents buf) + } + +(* Read the content until we got another one quote *) +and read_dquoted_string buf = parse + | [^ '"' '\\' ]+ + { Buffer.add_string buf (Lexing.lexeme lexbuf); + read_dquoted_string buf lexbuf + } + | "\\\"" + { Buffer.add_char buf '"'; + read_dquoted_string buf lexbuf + } + | '\\' + { Buffer.add_char buf '\\'; + read_dquoted_string buf lexbuf + } + | '"' + { + LITERAL (Buffer.contents buf) + } diff --git a/lib/configuration/expression_parser.messages b/lib/configuration/expression_parser.messages new file mode 100644 index 0000000..ff7e757 --- /dev/null +++ b/lib/configuration/expression_parser.messages @@ -0,0 +1,123 @@ +column_expr: R_PAREN +## + +Invalid expression + +path_expr: IDENT R_PAREN +column_expr: IDENT R_PAREN +column_expr: IDENT L_PAREN IDENT R_PAREN +path_expr: IDENT L_PAREN IDENT R_PAREN +## + +Misplaced function. Did you forgot to quote the text ? + +column_expr: IDENT L_PAREN EOF +path_expr: IDENT L_PAREN EOF +## + +Uncomplete expression + +column_expr: COLUMN R_PAREN +path_expr: COLUMN R_PAREN +## + +The path is missing. + +column_expr: LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL L_PAREN +column_expr: LITERAL CONCAT_OPERATOR LITERAL BINARY_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL BINARY_OPERATOR LITERAL L_PAREN +column_expr: LITERAL CONCAT_OPERATOR LITERAL CONCAT_OPERATOR LITERAL L_PAREN +path_expr: LITERAL CONCAT_OPERATOR LITERAL CONCAT_OPERATOR LITERAL L_PAREN +column_expr: IDENT L_PAREN L_PAREN LITERAL L_PAREN +path_expr: IDENT L_PAREN LITERAL L_PAREN +## + +A text is given where it was expected a function. + +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +column_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +path_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +path_expr: LITERAL BINARY_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA R_PAREN +## +## Ends in an error in state: 61. +## +## separated_nonempty_list(COMA,expr_(path_,COMA)) -> expr_(path_,COMA) COMA . 
separated_nonempty_list(COMA,expr_(path_,COMA)) [ R_PAREN ] +## +## The known suffix of the stack is as follows: +## expr_(path_,COMA) COMA +## + +Uncomplete expression + +column_expr: IDENT L_PAREN LITERAL COMA R_PAREN +path_expr: IDENT L_PAREN LITERAL COMA R_PAREN + +Misplaced coma + +column_expr: IDENT L_PAREN LITERAL EOF +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN EOF +path_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN EOF +path_expr: IDENT L_PAREN L_PAREN LITERAL EOF +## + +Uncomplete expression. Did you forgot a ')' ? + + +column_expr: LITERAL R_PAREN +path_expr: LITERAL BINARY_OPERATOR LITERAL R_PAREN +## + +Invalid expression + +path_expr: COLUMN IDENT L_PAREN +## +# Also apply to : +# path_expr: COLUMN IDENT COLUMN + +Misplaced path + +path_expr: COLUMN IDENT DOT R_PAREN +## + +Incomplete path: the table is missing + +column_expr: INTEGER BINARY_OPERATOR INTEGER R_PAREN +## + +Unbalanced parens. Did you wanted to write ')' instead of '(' ? + + +path_expr: IDENT L_PAREN L_BRACKET LITERAL R_PAREN + +Unbalanced brackets. Did you wanted to write ']' instead of ')' ? + +column_expr: IDENT L_PAREN LITERAL CONCAT_OPERATOR R_PAREN +path_expr: IDENT L_PAREN LITERAL CONCAT_OPERATOR R_PAREN +column_expr: LITERAL CONCAT_OPERATOR R_PAREN +path_expr: LITERAL CONCAT_OPERATOR R_PAREN +column_expr: IDENT L_PAREN LITERAL BINARY_OPERATOR R_PAREN +path_expr: IDENT L_PAREN LITERAL BINARY_OPERATOR R_PAREN +column_expr: LITERAL BINARY_OPERATOR R_PAREN +path_expr: LITERAL BINARY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR R_PAREN +column_expr: INTEGER INEQUALITY_OPERATOR R_PAREN +path_expr: INTEGER INEQUALITY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR INTEGER INEQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR INTEGER INEQUALITY_OPERATOR R_PAREN +column_expr: INTEGER EQUALITY_OPERATOR INTEGER EQUALITY_OPERATOR R_PAREN +path_expr: INTEGER EQUALITY_OPERATOR INTEGER EQUALITY_OPERATOR R_PAREN + +The operator expect two arguments. Only one is given + +column_expr: IDENT L_PAREN L_BRACKET R_PAREN +path_expr: IDENT L_PAREN L_BRACKET R_PAREN +column_expr: IDENT L_PAREN L_BRACKET LITERAL R_PAREN +column_expr: LITERAL CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET +path_expr: INTEGER CONCAT_OPERATOR IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET +path_expr: IDENT L_PAREN L_BRACKET R_BRACKET COMA L_BRACKET R_BRACKET R_BRACKET + +Mix between brackets and parens. 
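For illustration (hypothetical inputs, derived from the lexer and the grammar
in the next file), the expressions this parser accepts look like

    :A ^ ' ' ^ :B
    nvl(:clients.B, 'unknown')
    counter([:A], [:C])

and the messages above are reported when such an expression is incomplete or
mixes parens and brackets.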
+ diff --git a/lib/configuration/expression_parser.mly b/lib/configuration/expression_parser.mly new file mode 100644 index 0000000..1304c4d --- /dev/null +++ b/lib/configuration/expression_parser.mly @@ -0,0 +1,185 @@ +%token IDENT +%token L_PAREN +%token R_PAREN +%token L_BRACKET R_BRACKET +%token COLUMN +%token DOT +%token LITERAL +%token INTEGER +%token COMA +%token EOF +%token CONCAT_OPERATOR + +%token BINARY_OPERATOR +%token INEQUALITY_OPERATOR +%token EQUALITY_OPERATOR +%token BOOL_OPERATOR + +%start path_expr +%start column_expr + +%right BOOL_OPERATOR +%right INEQUALITY_OPERATOR EQUALITY_OPERATOR +%right CONCAT_OPERATOR BINARY_OPERATOR + +%{ + + let function_of_name param f = + match (String.lowercase_ascii f, param) with + | "nvl", _ -> + ImportExpression.T.Nvl param + | "join", (ImportExpression.T.Literal sep:: tl) -> + ImportExpression.T.Join (sep, tl) + | "join", (ImportExpression.T.Empty:: tl) -> + ImportExpression.T.Join ("", tl) + | "upper", _ -> + ImportExpression.T.Function' (ImportExpression.T.Upper, param) + | "trim", _ -> + ImportExpression.T.Function' (ImportExpression.T.Trim, param) + | other, _ -> + ImportExpression.T.Function (other, param) + +%} + +%% + +path_expr: + | expr_(path_, EOF) EOF { $1 } + | EOF { ImportExpression.T.Empty } +column_expr: + | expr_(column_, EOF) EOF { $1 } + | EOF { ImportExpression.T.Empty } + + +path_: + | COLUMN + column = IDENT + { ImportExpression.T.Path + Syntax.Path.{ alias = None + ; column = ImportCSV.Csv.column_of_string column + } + } + + | COLUMN + table = IDENT + DOT + column = IDENT + { ImportExpression.T.Path + Syntax.Path.{ alias = Some table + ; column = ImportCSV.Csv.column_of_string column} + } + +column_: + | COLUMN + column = IDENT + { try ImportExpression.T.Path (ImportCSV.Csv.column_of_string column) + with _ -> ImportExpression.T.Literal column } + +arguments(PATH): + | L_PAREN + expr = separated_list(COMA, expr_(PATH, COMA)) + R_PAREN + { expr } + +group(PATH): + | L_BRACKET + expr = separated_list(COMA, expr_(PATH, COMA)) + R_BRACKET + { expr } + +fixed(PATH): + | d = INTEGER + { ImportExpression.T.Integer d } + | l = LITERAL + { + if String.equal String.empty l then + ImportExpression.T.Empty + else + ImportExpression.T.Literal l + } + +%inline boperator: + | e = BINARY_OPERATOR { e } + | e = INEQUALITY_OPERATOR { e } + | e = EQUALITY_OPERATOR { e } + | e = BOOL_OPERATOR { e } + +(* The expression evaluation receveive in parameters : + 1. the way to buidl a path, as we have two distinct ways to build them in + the case of externals (the external_key does not allow a table name) + 2. a phantom type telling wich kind of element will end the expression. + This can be EOF for the root expression, or COMA when inside a function. + This prevent merlin to optimize thoses two path, and allow more precise + error messages. 
*) +expr_(PATH, ENDING_PHANTOM): + | L_PAREN + e = expr_(PATH, R_PAREN) + R_PAREN + { ImportExpression.T.Expr e + } + | + p1 = expr_(PATH, ENDING_PHANTOM) + CONCAT_OPERATOR + p2 = expr_(PATH, COMA) + { match p2 with + | ImportExpression.T.Concat args -> ImportExpression.T.Concat (p1::args) + | _ -> ImportExpression.T.Concat (p1::p2::[]) + } + | p1 = expr_(PATH, ENDING_PHANTOM) + + op = boperator + p2 = expr_(PATH, COMA) + { ImportExpression.T.BOperator (op, p1, p2) } + + | p1 = expr_(PATH, ENDING_PHANTOM) + op = EQUALITY_OPERATOR + p2 = group(PATH) + { ImportExpression.T.GEquality(op, p1, p2) } + + + + | p = PATH + { p } + | f = fixed(PATH) + { f } + | s = IDENT + args = arguments(PATH) + { function_of_name args s } + | + s = IDENT + L_PAREN + opt_arg = opt_arg(PATH, COMA)? + args1 = group(PATH) + COMA + args2 = group(PATH) + R_PAREN + { let window_name = ImportExpression.T.window_of_name s opt_arg in + ImportExpression.T.Window (window_name, args1, args2) } +(* + | (* This case is here to describe a window function which has 2 arguments + level. + I’m not completely satisfied with it, as it prevent the ability to + create a exprpression block with parens arround. *) + s = IDENT + L_PAREN + opt_arg = opt_arg(PATH, COMA)? + args1 = arguments(PATH) + COMA + args2 = arguments(PATH) + R_PAREN + { let window_name = ImportExpression.T.window_of_name s opt_arg in + let expr = ImportExpression.T.Window (window_name, args1, args2) in + + let expr_repr = ImportExpression.Repr.repr ~top:true (fun _ -> "") + expr in + Printf.printf "Deprecated syntax in \"%s\" use [] instead of ()\n" expr_repr; + + + expr + } +*) + +opt_arg(PATH, SEP): + | expr = expr_(PATH, COMA) + SEP + { expr } diff --git a/lib/configuration/importConf.ml b/lib/configuration/importConf.ml new file mode 100644 index 0000000..586be3c --- /dev/null +++ b/lib/configuration/importConf.ml @@ -0,0 +1,90 @@ +open StdLabels +module Syntax = Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module T = Read_conf +module Expression = ImportExpression.T + +let current_syntax = 1 + +let t_of_yojson : Yojson.Safe.t -> Syntax.t = + fun json -> + let keys = Yojson.Safe.Util.keys json in + let version = + match List.find_opt keys ~f:(String.equal "version") with + | None -> + Printf.printf + "No version given. Your setup may break in the future.\n\ + Please add « \"version\":%d » in your configuration.\n\n" + current_syntax; + `Int 1 + | Some _ -> Yojson.Safe.Util.member "version" json + in + + match version with + | `Int 1 -> Of_json.t_of_yojson json + | other -> + Printf.eprintf "Unsuported version : %s\n" (Yojson.Safe.to_string other); + exit 1 + +module TomlReader = Read_conf.Make (Helpers.Toml.Decode) + +let t_of_toml : Otoml.t -> (Syntax.t, string) result = + fun toml -> + let version = + Otoml.find toml (Otoml.get_integer ~strict:false) [ "version" ] + in + match version with + | 1 -> TomlReader.read toml + | _ -> + Printf.eprintf "Unsuported version : %d\n" version; + exit 1 + +let dummy_conf = + Syntax. 
+ { + source = { file = ""; tab = 0; name = "" }; + version = 1; + externals = []; + columns = []; + filters = []; + sort = []; + uniq = []; + } + +let get_table_for_name : Syntax.t -> string option -> Table.t = + fun conf name -> + match name with + | None -> conf.source + | Some name -> + if String.equal name conf.source.name then conf.source + else + let ext = + List.find conf.externals ~f:(fun (ext : Syntax.extern) -> + String.equal name ext.target.name) + in + ext.target + +let root_table : Syntax.t -> Table.t = fun conf -> conf.source + +let get_dependancies_for_table : Syntax.t -> Table.t -> Syntax.extern list = + fun conf source -> + let is_root = source = conf.source in + + List.filter conf.externals ~f:(fun (ext : Syntax.extern) -> + (* Enumerate the intern_key and check the source pointed by each column *) + Expression.fold_values ext.intern_key ~init:false ~f:(fun acc expr -> + if acc then acc + else + match expr.Syntax.Path.alias with + | Some v -> String.equal v source.name + | None -> is_root)) + +let print_path_expression t = ImportExpression.Repr.repr Path.repr t + +let print_extern t = + let toml = Syntax.toml_of_extern t in + Otoml.Printer.to_string toml + +let expression_from_string s = + Read_conf.ExpressionParser.of_string Read_conf.ExpressionParser.path s diff --git a/lib/configuration/importConf.mli b/lib/configuration/importConf.mli new file mode 100644 index 0000000..3a8ae75 --- /dev/null +++ b/lib/configuration/importConf.mli @@ -0,0 +1,23 @@ +module Syntax = Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +val dummy_conf : Syntax.t + +val root_table : Syntax.t -> Table.t +(** Get the root table, this table is the main table to load and each line in + this table will be processed *) + +val t_of_yojson : Yojson.Safe.t -> Syntax.t +val t_of_toml : Otoml.t -> (Syntax.t, string) result +val get_table_for_name : Syntax.t -> string option -> Table.t + +val get_dependancies_for_table : Syntax.t -> Table.t -> Syntax.extern list +(** Get all the externals refered by the source *) + +val print_path_expression : Path.t ImportExpression.T.t -> string + +val expression_from_string : + string -> (Path.t ImportExpression.T.t, string) result + +val print_extern : Syntax.extern -> string diff --git a/lib/configuration/of_json.ml b/lib/configuration/of_json.ml new file mode 100644 index 0000000..f9171b9 --- /dev/null +++ b/lib/configuration/of_json.ml @@ -0,0 +1,134 @@ +open StdLabels +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module Expression = ImportExpression.T + +open Ppx_yojson_conv_lib.Yojson_conv.Primitives + +let current_syntax = 1 + +let rec expression_of_yojson : + (Yojson.Safe.t -> 'a) -> Yojson.Safe.t -> 'a Expression.t = + fun f expr -> + match expr with + | `Null -> Empty + | `List l -> Concat (List.map ~f:(expression_of_yojson f) l) + | `String s as json -> ( + try Path (f json) with + | _ -> Literal s) + | `Assoc [ (fn, `List [ `List l1; `List l2 ]) ] + when String.equal "counter" (String.lowercase_ascii fn) -> + Window + ( Expression.Counter, + List.map ~f:(expression_of_yojson f) l1, + List.map ~f:(expression_of_yojson f) l2 ) + | `Assoc [ (fn, `List [ expr1; `List l2; `List l3 ]) ] + when String.equal "previous" (String.lowercase_ascii fn) -> + Window + ( Expression.Previous (expression_of_yojson f expr1), + List.map ~f:(expression_of_yojson f) l2, + List.map ~f:(expression_of_yojson f) l3 ) + | `Assoc [ (fn, `List l) ] when String.equal "nvl" (String.lowercase_ascii fn) + -> Nvl (List.map 
~f:(expression_of_yojson f) l) + | `Assoc [ (fn, `List l) ] -> + Function + (String.lowercase_ascii fn, List.map ~f:(expression_of_yojson f) l) + | json -> ( + try Path (f json) with + | _ -> + let str_json = Yojson.Safe.pretty_to_string json in + raise + (ImportErrors.JsonError { json = str_json; element = "Expression" }) + ) + +type 'a expression = 'a Expression.t +type column = Path.column + +let column_of_yojson : Yojson.Safe.t -> int = function + | `Int i -> i + | `String s -> ImportCSV.Csv.column_of_string s + | _ -> raise (Invalid_argument "column") + +let yojson_of_column i = `String (ImportCSV.Csv.column_to_string i) + +type path = Syntax.Path.t = { + alias : string option; [@default None] [@yojson_drop_default ( = )] + (* External file to load, when the information is missing, load in + the current file *) + column : column; +} +[@@deriving of_yojson] + +let path_of_yojson : Yojson.Safe.t -> path = function + | `String s -> + Scanf.sscanf s ":%s@.%s" (fun table column -> + if String.equal column "" then + { alias = None; column = ImportCSV.Csv.column_of_string table } + else + { + alias = Some table; + column = ImportCSV.Csv.column_of_string column; + }) + | other -> path_of_yojson other + +let yojson_of_path : path -> Yojson.Safe.t = + fun { alias; column } -> + let table = + match alias with + | None -> "" + | Some table -> String.cat table "." + in + + `String + (String.concat ~sep:"" + [ ":"; table; ImportCSV.Csv.column_to_string column ]) + +type table = Table.t = { + file : string; + tab : int; [@default 1] [@yojson_drop_default ( = )] + name : string; +} +[@@deriving of_yojson] + +type extern = { + source : string option; [@default None] [@yojson_drop_default ( = )] + intern_key : column expression; + target : table; + extern_key : column expression; + allow_missing : bool; [@default false] [@yojson_drop_default ( = )] + match_rule : string option; [@default None] [@yojson_drop_default ( = )] +} +[@@deriving of_yojson] + +type syntax_v1_extern = Syntax.extern + +let syntax_v1_extern_of_yojson yojson = + let e = extern_of_yojson yojson in + let intern_key : path Expression.t = + Expression.map e.intern_key ~f:(fun column -> + Syntax.Path.{ column; alias = e.source }) + in + Syntax. 
+ { + extern_key = e.extern_key; + intern_key; + target = e.target; + allow_missing = e.allow_missing; + match_rule = e.match_rule; + } + +type predicate = unit + +let predicate_of_yojson _ = () +let yojson_of_predicate () = `Null + +type t = Syntax.t = { + version : int; [@default current_syntax] + source : table; + externals : syntax_v1_extern list; [@default []] + columns : path expression list; + filters : path expression list; [@default []] [@yojson_drop_default ( = )] + sort : path expression list; [@default []] [@yojson_drop_default ( = )] + uniq : path expression list; [@default []] [@yojson_drop_default ( = )] +} +[@@deriving of_yojson] diff --git a/lib/configuration/read_conf.ml b/lib/configuration/read_conf.ml new file mode 100644 index 0000000..8d467a5 --- /dev/null +++ b/lib/configuration/read_conf.ml @@ -0,0 +1,216 @@ +open StdLabels +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +module ExpressionParser : sig + type 'a path_builder + + val path : Path.t path_builder + val column : Path.column path_builder + + val of_string : + 'a path_builder -> string -> ('a ImportExpression.T.t, string) result +end = struct + module MI = Expression_parser.MenhirInterpreter + module E = MenhirLib.ErrorReports + module L = MenhirLib.LexerUtil + + type error = { + message : string; + start_line : int; + start_pos : int; + end_pos : int; + } + + let range_message start_pos end_pos message = + let start_c = start_pos.Lexing.pos_cnum - start_pos.Lexing.pos_bol + and end_c = end_pos.Lexing.pos_cnum - end_pos.Lexing.pos_bol in + { + message; + start_line = start_pos.Lexing.pos_bol; + start_pos = start_c; + end_pos = end_c; + } + + (** Extract the line in error from the whole expression, and print some + characters just under the faulty part *) + let get_line_error : error -> string -> string = + fun error content -> + let sub_text = + try + let end_pos = String.index_from content error.start_line '\n' in + + String.sub content ~pos:error.start_line + ~len:(end_pos - error.start_line) + with + | Not_found -> + (* There is no new line, extract the ending part *) + let len = String.length content - error.start_line in + String.sub content ~pos:error.start_line ~len + in + (* I’m not sure how to produce it, but the error may be over two lines. + This line is here to prevent the underline to overflow. 
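     For illustration (hypothetical input), a faulty expression such as
     [join(:A :B)] is reported roughly as

       join(:A :B)
               ^^

     that is, the offending line followed by a caret underline covering the
     error range.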
*) + let stop_pos = min error.end_pos (String.length sub_text) in + let error_length = stop_pos - error.start_pos in + String.concat ~sep:"" + [ + sub_text; + "\n"; + String.make error.start_pos ' '; + String.make error_length '^'; + ] + + let get_parse_error default_position env : error = + match MI.stack env with + | (lazy Nil) -> + range_message default_position.Lexing.lex_start_p + default_position.Lexing.lex_curr_p "Invalid syntax\n" + | (lazy (Cons (MI.Element (state, _, start_pos, end_pos), _))) -> + let message = + try Expression_parser_messages.message (MI.number state) with + | Not_found -> "Invalid syntax (no specific message for this eror)\n" + in + + range_message start_pos end_pos message + + type 'a path_builder = + Lexing.position -> 'a ImportExpression.T.t MI.checkpoint + + let rec _parse lexbuf (checkpoint : 'a ImportExpression.T.t MI.checkpoint) = + match checkpoint with + | MI.InputNeeded _env -> + let token = Expression_lexer.token lexbuf in + let startp = lexbuf.lex_start_p and endp = lexbuf.lex_curr_p in + let checkpoint = MI.offer checkpoint (token, startp, endp) in + _parse lexbuf checkpoint + | MI.Shifting _ | MI.AboutToReduce _ -> + let checkpoint = MI.resume checkpoint in + _parse lexbuf checkpoint + | MI.HandlingError _env -> + let err = get_parse_error lexbuf _env in + Error err + | MI.Accepted v -> Ok v + | MI.Rejected -> + let err = + range_message lexbuf.lex_start_p lexbuf.lex_curr_p + "invalid syntax (parser rejected the input)" + in + Error err + + let of_string : + 'a path_builder -> string -> ('a ImportExpression.T.t, string) result = + fun f str_expression -> + try + let lexbuf = Lexing.from_string str_expression in + let init = f lexbuf.lex_curr_p in + match _parse lexbuf init with + | Ok res -> Ok res + | Error e -> + let message = + String.concat ~sep:"\n" + [ e.message; get_line_error e str_expression ] + in + Error message + with + | Expression_lexer.UnclosedQuote { line; content } -> + let message = + Printf.sprintf "Unclosed quote at line %d : \"%s\"" line content + in + Error message + | e -> + let message = Printexc.to_string e in + Error message + + let path = Expression_parser.Incremental.path_expr + let column = Expression_parser.Incremental.column_expr +end + +module Make (S : Decoders.Decode.S) = struct + let ( let* ) = S.( let* ) + let ( and* ) = S.( and* ) + let ( >>= ) = S.( >>= ) + let ( >|= ) = S.( >|= ) + + class loader = + object (self) + method parse_expression : type a. + a ExpressionParser.path_builder -> + S.value -> + (a ImportExpression.T.t, S.value Decoders.Error.t) result = + fun path -> + S.string >>= fun v -> + match ExpressionParser.of_string path v with + | Ok expr -> S.succeed expr + | Error e -> S.fail_with Decoders.Error.(make e) + + method source = + let* file = S.field "file" S.string + and* name = S.field "name" S.string + and* tab = S.field_opt_or ~default:1 "tab" S.int in + S.succeed { Table.file; name; tab } + + method external_ name = + let* intern_key = + S.field "intern_key" (self#parse_expression ExpressionParser.path) + and* extern_key = + S.field "extern_key" (self#parse_expression ExpressionParser.column) + and* file = S.field "file" S.string + and* tab = S.field_opt_or ~default:1 "tab" S.int + and* allow_missing = + S.field_opt_or ~default:false "allow_missing" S.bool + in + + S.succeed + Syntax. 
+ { + intern_key; + extern_key; + target = { name; file; tab }; + allow_missing; + match_rule = None; + } + + method sheet = + let* columns = + S.field "columns" + @@ S.list (self#parse_expression ExpressionParser.path) + and* filters = + S.field_opt_or ~default:[] "filters" + @@ S.list (self#parse_expression ExpressionParser.path) + and* sort = + S.field_opt_or ~default:[] "sort" + @@ S.list (self#parse_expression ExpressionParser.path) + and* uniq = + S.field_opt_or ~default:[] "uniq" + @@ S.list (self#parse_expression ExpressionParser.path) + in + S.succeed @@ fun version source externals -> + Syntax.{ version; source; externals; columns; filters; sort; uniq } + + method conf = + let* source = S.field "source" self#source + and* externals = + S.field_opt_or ~default:[] "externals" + (S.key_value_pairs_seq self#external_) + in + let* sheet = + S.field "sheet" self#sheet >|= fun v -> v 1 source externals + in + + S.succeed sheet + end + + let read_file file = + S.decode_file (new loader)#conf file + |> Result.map_error (fun v -> + let formatter = Format.str_formatter in + Format.fprintf formatter "%a@." S.pp_error v; + Format.flush_str_formatter ()) + + let read toml = + S.decode_value (new loader)#conf toml + |> Result.map_error (fun v -> + let formatter = Format.str_formatter in + Format.fprintf formatter "%a@." S.pp_error v; + Format.flush_str_formatter ()) +end diff --git a/lib/configuration/syntax.ml b/lib/configuration/syntax.ml new file mode 100644 index 0000000..8efdc59 --- /dev/null +++ b/lib/configuration/syntax.ml @@ -0,0 +1,88 @@ +open StdLabels +module E = ImportExpression.T +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +let toml_of_table Table.{ file; tab; name } = + let values = [ ("file", Otoml.string file); ("name", Otoml.string name) ] in + let values = + match tab with + | 1 -> values + | tab -> ("tab", Otoml.integer tab) :: values + in + + Otoml.table values + +type extern = { + intern_key : Path.t E.t; + target : Table.t; + extern_key : Path.column E.t; + allow_missing : bool; + match_rule : string option; +} + +let toml_of_extern extern = + let values = + [ + ( "intern_key", + Otoml.string + @@ ImportExpression.Repr.repr ~top:true Path.repr extern.intern_key ); + ( "extern_key", + Otoml.string + @@ ImportExpression.Repr.repr ~top:true + (fun v -> ":" ^ ImportCSV.Csv.column_to_string v) + extern.extern_key ); + ("file", Otoml.string extern.target.file); + ("allow_missing", Otoml.boolean extern.allow_missing); + ] + in + + let values = + match extern.target.tab with + | 1 -> values + | tab -> ("tab", Otoml.integer tab) :: values + in + + Otoml.table values + +let toml_of_externs externs = + List.map externs ~f:(fun e -> (e.target.name, toml_of_extern e)) + |> Otoml.table + +type t = { + version : int; + source : Table.t; + externals : extern list; + columns : Path.t E.t list; + filters : Path.t E.t list; + sort : Path.t E.t list; + uniq : Path.t E.t list; +} + +let repr t = + let repr_expression_list l = + Otoml.array + (List.map l ~f:(fun v -> + Otoml.string (ImportExpression.Repr.repr ~top:true Path.repr v))) + in + + let sheet = + Otoml.table + [ + ("columns", repr_expression_list t.columns); + ("filters", repr_expression_list t.filters); + ("sort", repr_expression_list t.sort); + ("uniq", repr_expression_list t.uniq); + ] + in + + let values = + [ + ("version", Otoml.integer t.version); + ("source", toml_of_table t.source); + ("externals", toml_of_externs t.externals); + ("sheet", sheet); + ] + in + + Otoml.table values diff --git 
a/lib/containers/dune b/lib/containers/dune new file mode 100755 index 0000000..46d0e24 --- /dev/null +++ b/lib/containers/dune @@ -0,0 +1,7 @@ +(library + (name importContainers) + (libraries + importDataTypes + importConf + ) +) diff --git a/lib/containers/importContainers.ml b/lib/containers/importContainers.ml new file mode 100644 index 0000000..bf65ba4 --- /dev/null +++ b/lib/containers/importContainers.ml @@ -0,0 +1,61 @@ +module Conf = ImportConf +module Syntax = Conf.Syntax +module Table = ImportDataTypes.Table + +(** This key is used to create the table of each externals in the + configuration. + + This table allow to check if there are cycles between the references *) +module KeyName : sig + type t + + val compare : t -> t -> int + val from_table : Table.t -> t + val hash : t -> int + val equal : t -> t -> bool +end = struct + type t = Table.t + + let compare v1 v2 = String.compare (Table.name v1) (Table.name v2) + (* We use the alias given in the configuration as key, because we want to + be sure there is no cycle in the dependencies. It’s OK to have the same + file used in differents sources, but the sources cannot induce cycles *) + + let from_table (s : Table.t) = s + let hash = Hashtbl.hash + let equal v1 v2 = String.equal (Table.name v1) (Table.name v2) +end + +module Source : sig + type t + + val compare : t -> t -> int + val from_table : Table.t -> t + val hash : t -> int + val equal : t -> t -> bool + val name : t -> KeyName.t +end = struct + type t = Table.t + + let compare v1 v2 = String.compare v1.Table.name v2.Table.name + (* We use the alias given in the configuration as key, because we want to + be sure there is no cycle in the dependencies. It’s OK to have the same + file used in differents sources, but the sources cannot induce cycles *) + + let from_table (s : Table.t) = s + let hash = Hashtbl.hash + let equal v1 v2 = String.equal v1.Table.name v2.Table.name + let name t = KeyName.from_table t +end + +module Externals = MoreLabels.Map.Make (KeyName) +module IntSet = MoreLabels.Set.Make (Int) + +let show_intSet set = + let b = Buffer.create 16 in + IntSet.iter + ~f:(fun v -> + Buffer.add_string b (string_of_int v); + Buffer.add_char b ',') + set; + Buffer.contents b diff --git a/lib/csv/csv.ml b/lib/csv/csv.ml new file mode 100644 index 0000000..db7329d --- /dev/null +++ b/lib/csv/csv.ml @@ -0,0 +1,30 @@ +open StdLabels + +type t = int + +let column_of_char = function + | 'A' .. 'Z' as c -> Char.code c - (Char.code 'A' - 1) + | 'a' .. 'z' as c -> Char.code c - (Char.code 'a' - 1) + | c -> raise (Invalid_argument ("column: " ^ Char.escaped c)) + +let column_of_string : string -> int = + fun s -> + String.fold_left s ~init:0 ~f:(fun value c -> (value * 26) + column_of_char c) + +(** Accumulate the remaining for the successives divisions in a list. 
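+
+    For example, [_to_char ~b:[] 28] accumulates [ 'A'; 'B' ], which
+    [column_to_string] renders as ["AB"]; conversely [column_of_string "AB"]
+    gives back [28].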
*) +let rec _to_char ~b i = + if i > 0 then + let res = i mod 26 in + let res = if res = 0 then 26 else res in + + let c = char_of_int @@ (res + 64) in + (* The modulo is accumulated in the list head, which is the expected + sequence *) + let b = c :: b in + + _to_char ~b @@ ((i - res) / 26) + else b + +let column_to_string i = + let res = _to_char ~b:[] i in + List.to_seq res |> String.of_seq diff --git a/lib/csv/dataType.ml b/lib/csv/dataType.ml new file mode 100644 index 0000000..c582b9c --- /dev/null +++ b/lib/csv/dataType.ml @@ -0,0 +1,21 @@ +let match_date = Re.Str.regexp {|[0-9]+/[0-9]+/[0-9]+|} + +type t = + | Null + | Error of string + | Content of string + | Integer of int + | Float of float + +let to_string = function + | Null -> "" + | Error s -> s + | Integer i -> string_of_int i + | Float f -> string_of_float f + | Content c -> ( + match String.starts_with ~prefix:"0" c with + | false -> c + | true -> + (* If the string is a date, do not escape it *) + if Re.Str.string_match match_date c 0 then c + else String.concat "" [ "=\""; c; "\"" ]) diff --git a/lib/csv/dataType.mli b/lib/csv/dataType.mli new file mode 100644 index 0000000..ebb8bc7 --- /dev/null +++ b/lib/csv/dataType.mli @@ -0,0 +1,8 @@ +type t = + | Null + | Error of string + | Content of string + | Integer of int + | Float of float + +val to_string : t -> string diff --git a/lib/csv/dune b/lib/csv/dune new file mode 100755 index 0000000..b0f4a72 --- /dev/null +++ b/lib/csv/dune @@ -0,0 +1,6 @@ +(library + (name importCSV) + (libraries + re + ) +) diff --git a/lib/data_types/dune b/lib/data_types/dune new file mode 100644 index 0000000..e38310b --- /dev/null +++ b/lib/data_types/dune @@ -0,0 +1,10 @@ +(library + (name importDataTypes) + (libraries + importCSV + ) + + (preprocess (pps ppx_deriving.ord)) + ) + + diff --git a/lib/data_types/path.ml b/lib/data_types/path.ml new file mode 100644 index 0000000..6684b5a --- /dev/null +++ b/lib/data_types/path.ml @@ -0,0 +1,15 @@ +type column = int [@@deriving ord] + +type t = { + alias : string option; + (* External file to load, when the information is missing, load in + the current file *) + column : column; +} +[@@deriving ord] + +let repr { alias; column } = + let column_text = ImportCSV.Csv.column_to_string column in + match alias with + | None -> ":" ^ column_text + | Some value -> ":" ^ value ^ "." ^ column_text diff --git a/lib/data_types/readme.rst b/lib/data_types/readme.rst new file mode 100644 index 0000000..ac609d2 --- /dev/null +++ b/lib/data_types/readme.rst @@ -0,0 +1,4 @@ +This module contains all the types used in the application. + +It does not depends on any other library, and does not cause any dependency +cycle. diff --git a/lib/data_types/table.ml b/lib/data_types/table.ml new file mode 100644 index 0000000..d807c5c --- /dev/null +++ b/lib/data_types/table.ml @@ -0,0 +1,19 @@ +open StdLabels + +type t = { + file : string; + tab : int; + name : string; +} + +(** Get the internal name for the given table. + + This value may differ from the association name given in the configuration. 
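+
+    For example, a table [{ file = "items.xlsx"; tab = 2; name = "prices" }]
+    (illustrative values) gets the internal name ["items_2"].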
*) +let name : t -> string = + fun source -> + let file_name = + source.file |> Filename.basename |> Filename.remove_extension + in + match source.tab with + | 1 -> file_name + | _ -> String.concat ~sep:"_" [ file_name; string_of_int source.tab ] diff --git a/lib/data_types/types.ml b/lib/data_types/types.ml new file mode 100644 index 0000000..37fd90f --- /dev/null +++ b/lib/data_types/types.ml @@ -0,0 +1,15 @@ +type t = + | Number + | String + | Bool + | None + | Extern + | Float + +let string_of_t : t -> string = function + | Number -> "Number" + | String -> "String" + | Bool -> "Bool" + | None -> "None" + | Extern -> "Extern" + | Float -> "Float" diff --git a/lib/errors/dune b/lib/errors/dune new file mode 100644 index 0000000..ab71219 --- /dev/null +++ b/lib/errors/dune @@ -0,0 +1,9 @@ +(library + (name importErrors) + (libraries + csv + sqlite3 + importCSV + importDataTypes + ) +) diff --git a/lib/errors/importErrors.ml b/lib/errors/importErrors.ml new file mode 100644 index 0000000..04f9deb --- /dev/null +++ b/lib/errors/importErrors.ml @@ -0,0 +1,98 @@ +open StdLabels +module CSV = ImportCSV +module Table = ImportDataTypes.Table + +let bom = "\xEF\xBB\xBF" + +type xlsError = { + source : Table.t; + row : int; + sheet : int; + target : Table.t option; + value : CSV.DataType.t; + exn : exn; +} + +exception + JsonError of { + json : string; + element : string; + } + +exception InvalidEncoding of string +exception NullKey of int +exception SqlError of Sqlite3.Rc.t +exception MisplacedWindow +exception Unknown_source of string +exception Unknown_extension of string + +exception Cycle of string list +(** Cycle between the dpendencies *) + +exception + TypeError of { + expected : ImportDataTypes.Types.t; + actual : ImportDataTypes.Types.t; + expression : string; + subset : string; + } + +let repr_error = function + | SqlError s -> Printf.sprintf "%s Error" (Sqlite3.Rc.to_string s) + | JsonError { json; element } -> Printf.sprintf "%s : %s" element json + | NullKey k -> + Printf.sprintf "The key %s is null" (ImportCSV.Csv.column_to_string k) + | Unknown_source source -> + Printf.sprintf "The source %s is referenced without beiing declared" + source + | MisplacedWindow -> "A group function cannot appear here" + | TypeError { expected; actual; expression; subset } -> + Printf.sprintf + "In this expression %s has type %s but %s was expected:\n%s" subset + (ImportDataTypes.Types.string_of_t actual) + (ImportDataTypes.Types.string_of_t expected) + expression + | Unknown_extension ext -> Printf.sprintf "Unknown file extension %s" ext + | Cycle deps -> + Printf.sprintf "Cycle between the dependencies : %s" + (String.concat ~sep:"," deps) + | Failure e -> e + | e -> + Printexc.print_backtrace Stdlib.stdout; + Printexc.to_string e + +type t = Csv.out_channel Lazy.t + +let output_error : t -> xlsError -> unit = + fun csv error -> + let target = + match error.target with + | None -> "" + | Some value -> value.name + in + + Csv.output_record (Lazy.force csv) + [ + Table.name error.source; + string_of_int error.sheet; + string_of_int error.row; + target; + CSV.DataType.to_string error.value; + repr_error error.exn; + ] + +let log : with_bom:bool -> string -> string -> t = + fun ~with_bom prefix dirname -> + lazy + (let file = Filename.concat dirname (String.cat prefix "_errors.csv") in + + prerr_endline @@ String.cat "Errors found. 
See the file " file; + + (* Open the error file *) + let channel = Out_channel.open_bin file in + if with_bom then output_string channel bom; + let log_error = Csv.to_channel ~separator:';' ~excel_tricks:true channel in + (* Generate a header for the error file *) + Csv.output_record log_error + [ "Source"; "Tab"; "Row"; "Target"; "Value"; "Error" ]; + log_error) diff --git a/lib/errors/importErrors.mli b/lib/errors/importErrors.mli new file mode 100644 index 0000000..7d17a5d --- /dev/null +++ b/lib/errors/importErrors.mli @@ -0,0 +1,46 @@ +module CSV = ImportCSV + +exception SqlError of Sqlite3.Rc.t +exception MisplacedWindow + +exception Unknown_source of string +(** Exception raised when a source used in a path is not declared as an + external *) + +exception Cycle of string list +(** Cycle between the dpendencies *) + +exception Unknown_extension of string + +exception + JsonError of { + json : string; + element : string; + } + +type xlsError = { + source : ImportDataTypes.Table.t; + row : int; + sheet : int; + target : ImportDataTypes.Table.t option; + value : CSV.DataType.t; + exn : exn; +} + +exception + TypeError of { + expected : ImportDataTypes.Types.t; + actual : ImportDataTypes.Types.t; + expression : string; + subset : string; + } + +exception InvalidEncoding of string +exception NullKey of int + +val repr_error : exn -> string + +type t = Csv.out_channel Lazy.t + +val output_error : t -> xlsError -> unit +val log : with_bom:bool -> string -> string -> t diff --git a/lib/expression/ast.ml b/lib/expression/ast.ml new file mode 100644 index 0000000..ef083e9 --- /dev/null +++ b/lib/expression/ast.ml @@ -0,0 +1,31 @@ +(** This module rebuilds an AST from an evaluation *) + +type 'a repr = 'a T.t +type 'a obs = 'a T.t +type 'a path_repr = unit + +let observe : 'a repr -> 'a obs = Fun.id +let empty : unit -> 'a repr = fun () -> T.Empty +let expr : 'a repr -> 'a repr = fun t -> T.Expr t +let literal : string -> 'a repr = fun s -> T.Literal s +let integer : string -> 'a repr = fun i -> T.Integer i +let path : 'a path_repr -> 'a -> 'a repr = fun _repr p -> T.Path p +let concat : 'a repr list -> 'a repr = fun ll -> T.Concat ll + +let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun w groups order -> T.Window (w, groups, order) + +let nvl : 'a repr list -> 'a repr = fun ll -> T.Nvl ll +let join : string -> 'a repr list -> 'a repr = fun s ll -> T.Join (s, ll) + +let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op e1 e2 -> T.BOperator (op, e1, e2) + +let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op e1 ll -> T.GEquality (op, e1, ll) + +let funct : string -> 'a repr list -> 'a repr = + fun name args -> T.Function (name, args) + +let function' : T.funct -> 'a repr list -> 'a repr = + fun f args -> T.Function' (f, args) diff --git a/lib/expression/compose.ml b/lib/expression/compose.ml new file mode 100644 index 0000000..028602b --- /dev/null +++ b/lib/expression/compose.ml @@ -0,0 +1,150 @@ +open StdLabels + +(** Build an expression module with the result from another expression. The + signature of the fuctions is a bit different, as they all receive the + result from the previous evaluated element in argument. 
*) +module Expression + (E : Sym.SYM_EXPR) + (R : sig + val v : 'a E.path_repr + end) = +struct + module type SIG = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : 'a E.obs -> 'a repr + val expr : 'a E.obs * 'a repr -> 'a E.obs -> 'a repr + val literal : string -> 'a E.obs -> 'a repr + val integer : string -> 'a E.obs -> 'a repr + val path : 'a path_repr -> 'a -> 'a E.obs -> 'a repr + val concat : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val window : + ('a E.obs * 'a repr) T.window -> + ('a E.obs * 'a repr) list -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val nvl : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val join : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val boperator : + T.binary_operator -> + 'a E.obs * 'a repr -> + 'a E.obs * 'a repr -> + 'a E.obs -> + 'a repr + + val gequality : + T.binary_operator -> + 'a E.obs * 'a repr -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val funct : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val function' : T.funct -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val observe : 'a E.obs * 'a repr -> 'a obs + end + + module Make (M : SIG) = struct + type 'a repr = 'a E.repr * 'a M.repr + type 'a obs = 'a M.obs + type 'a path_repr = 'a M.path_repr + + let map' : 'a repr list -> 'a E.repr list * ('a E.obs * 'a M.repr) list = + fun ll -> + let e = List.map ~f:fst ll in + (e, List.map ll ~f:(fun (e, m) -> (E.observe e, m))) + + let observe : 'a repr -> 'a obs = fun (t, v) -> M.observe (E.observe t, v) + + let empty : unit -> 'a repr = + fun () -> + let e = E.empty () in + (e, M.empty (E.observe e)) + + let expr : 'a repr -> 'a repr = + fun (e, m) -> + let e' = E.expr e in + (e', M.expr (E.observe e, m) (E.observe e')) + + let literal : string -> 'a repr = + fun litt -> + let e = E.literal litt in + (e, M.literal litt (E.observe e)) + + let integer : string -> 'a repr = + fun i -> + let e' = E.integer i in + (e', M.integer i (E.observe e')) + + let path : 'b path_repr -> 'b -> 'a repr = + fun path_repr path -> + let e = E.path R.v path in + let m = M.path path_repr path (E.observe e) in + (e, m) + + let concat : 'a repr list -> 'a repr = + fun reprs -> + let e, m = map' reprs in + let e' = E.concat e in + (e', M.concat m (E.observe e')) + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun window expressions order -> + let e_expressions, m_expressions = map' expressions + and e_order, m_order = map' order + and e_window = T.map_window window ~f:fst + and m_window = T.map_window window ~f:(fun (e, m) -> (E.observe e, m)) in + + let e = E.window e_window e_expressions e_order in + (e, M.window m_window m_expressions m_order (E.observe e)) + + let nvl : 'a repr list -> 'a repr = + fun reprs -> + let e, m = List.split reprs in + + let e' = E.nvl e in + let e = List.map ~f:E.observe e in + (e', M.nvl (List.combine e m) (E.observe e')) + + let join : string -> 'a repr list -> 'a repr = + fun sep reprs -> + let e_reprs, m = map' reprs in + + let e = E.join sep e_reprs in + (e, M.join sep m (E.observe e)) + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op (e1, m1) (e2, m2) -> + let e1' = E.observe e1 + and e2' = E.observe e2 + and e = E.boperator op e1 e2 in + let m' = M.boperator op (e1', m1) (e2', m2) (E.observe e) in + (e, m') + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op (e1, m1) exprs -> + let e_reprs, m_reprs = map' exprs in + let e' = 
E.gequality op e1 e_reprs in + let m' = M.gequality op (E.observe e1, m1) m_reprs (E.observe e') in + (e', m') + + let funct : string -> 'a repr list -> 'a repr = + fun sep reprs -> + let e_reprs, m = map' reprs in + + let e = E.funct sep e_reprs in + (e, M.funct sep m (E.observe e)) + + let function' : T.funct -> 'a repr list -> 'a repr = + fun f reprs -> + let e_reprs, m = map' reprs in + let e = E.function' f e_reprs in + (e, M.function' f m (E.observe e)) + end +end diff --git a/lib/expression/compose.mli b/lib/expression/compose.mli new file mode 100644 index 0000000..4cced8c --- /dev/null +++ b/lib/expression/compose.mli @@ -0,0 +1,59 @@ +(** Build an expression module with the result from another expression. The + signature of the fuctions is a bit different, as they all receive the + result from the previous evaluated element in argument. *) +module Expression + (E : Sym.SYM_EXPR) + (_ : sig + val v : 'a E.path_repr + end) : sig + (** The signature for the module the adapt is a bit different for the + SYM_EXPR: every function takes an extra argument which is the Expression + we are wrapping, and every expression becomes a tuple with the same + expression represented in the composed type. *) + module type SIG = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : 'a E.obs -> 'a repr + val expr : 'a E.obs * 'a repr -> 'a E.obs -> 'a repr + val literal : string -> 'a E.obs -> 'a repr + val integer : string -> 'a E.obs -> 'a repr + val path : 'a path_repr -> 'a -> 'a E.obs -> 'a repr + val concat : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val window : + ('a E.obs * 'a repr) T.window -> + ('a E.obs * 'a repr) list -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val nvl : ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val join : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + + val boperator : + T.binary_operator -> + 'a E.obs * 'a repr -> + 'a E.obs * 'a repr -> + 'a E.obs -> + 'a repr + + val gequality : + T.binary_operator -> + 'a E.obs * 'a repr -> + ('a E.obs * 'a repr) list -> + 'a E.obs -> + 'a repr + + val funct : string -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val function' : T.funct -> ('a E.obs * 'a repr) list -> 'a E.obs -> 'a repr + val observe : 'a E.obs * 'a repr -> 'a obs + end + + module Make (M : SIG) : + Sym.SYM_EXPR + with type 'a obs = 'a M.obs + and type 'a repr = 'a E.repr * 'a M.repr + and type 'a path_repr = 'a M.path_repr +end diff --git a/lib/expression/dune b/lib/expression/dune new file mode 100755 index 0000000..96e386e --- /dev/null +++ b/lib/expression/dune @@ -0,0 +1,9 @@ +(library + (name importExpression) + (libraries + re + importCSV + importDataTypes + importErrors + ) +) diff --git a/lib/expression/filters.ml b/lib/expression/filters.ml new file mode 100644 index 0000000..42c794b --- /dev/null +++ b/lib/expression/filters.ml @@ -0,0 +1,193 @@ +(** This module evaluate the sql query to use in order to filter an expression + + The result is built over [Query] except for the group function, which are + translated into a CTE in sql + *) + +open StdLabels +module Q = Query + +type 'a result = { + repr : Format.formatter -> nested:Query.QueryParameter.t -> unit; + group : 'a T.t option; +} + +module Filter = struct + type 'a repr = { + repr : 'a Q.Query.repr; + with_group : 'a T.t option; + } + + type 'a obs = 'a result + type 'a path_repr = 'a Q.Query.path_repr + + let observe : 'a Ast.obs * 'a repr -> 'a obs = + fun (_, v) -> { repr = Q.Query.observe v.repr; group = v.with_group } + 
+ let empty : 'a Ast.obs -> 'a repr = + fun _ -> { repr = Q.Query.empty (); with_group = None } + + let expr : 'a Ast.obs * 'a repr -> 'a Ast.obs -> 'a repr = + fun (_, expr) _ -> + { repr = Q.Query.expr expr.repr; with_group = expr.with_group } + + let path : 'a path_repr -> 'a -> 'a Ast.obs -> 'a repr = + fun repr p _ -> { repr = Q.Query.path repr p; with_group = None } + + let literal : string -> 'a Ast.obs -> 'a repr = + fun l _ -> { repr = Q.Query.literal l; with_group = None } + + let integer : string -> 'a Ast.obs -> 'a repr = + fun l _ -> { repr = Q.Query.integer l; with_group = None } + + let nvl : ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.nvl expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let concat : ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.concat expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let join : string -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun sep expression _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expression + and with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expression + in + match with_group with + | None -> { repr = Q.Query.join sep expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let boperator : + T.binary_operator -> + 'a Ast.obs * 'a repr -> + 'a Ast.obs * 'a repr -> + 'a Ast.obs -> + 'a repr = + fun name (_, e1) (_, e2) _ -> + let with_group = + match (e1.with_group, e2.with_group) with + | Some e, None -> Some e + | None, Some e -> Some e + | None, None -> None + | _ -> raise ImportErrors.MisplacedWindow + in + { repr = Q.Query.boperator name e1.repr e2.repr; with_group } + + let gequality : + T.binary_operator -> + 'a Ast.obs * 'a repr -> + ('a Ast.obs * 'a repr) list -> + 'a Ast.obs -> + 'a repr = + fun name (_, e1) group _ -> + let group_repr = List.map ~f:(fun v -> (snd v).repr) group + and with_group = List.find_map ~f:(fun v -> (snd v).with_group) group in + + match with_group with + | None -> + { + repr = Q.Query.gequality name e1.repr group_repr; + with_group = e1.with_group; + } + | _ -> raise ImportErrors.MisplacedWindow + + let funct : string -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun name expressions _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expressions in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + in + match with_group with + | None -> { repr = Q.Query.funct name expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + let function' : + T.funct -> ('a Ast.obs * 'a repr) list -> 'a Ast.obs -> 'a repr = + fun name expressions _ -> + let expr_repr = List.map ~f:(fun v -> (snd v).repr) expressions in + let with_group = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + in + match with_group with + | None -> + { repr = Q.Query.funct (T.name_of_function name) expr_repr; with_group } + | Some _ -> raise ImportErrors.MisplacedWindow + + (** Window functions are not handled in the filters, we save them as an AST + in order to process 
them in a separated handler. + + It is not allowed to build nested window functions. *) + let window : + ('a Ast.obs * 'a repr) T.window -> + ('a Ast.obs * 'a repr) list -> + ('a Ast.obs * 'a repr) list -> + 'a Ast.obs -> + 'a repr = + fun name expressions order ast -> + ignore name; + let with_group_expr = + List.find_map ~f:(fun v -> (snd v).with_group) expressions + and with_group_order = + List.find_map ~f:(fun v -> (snd v).with_group) order + in + match (with_group_expr, with_group_order) with + | Some _, _ | _, Some _ -> raise ImportErrors.MisplacedWindow + | None, None -> + (* The column name used with the cte. The name is fixed here, and used + as is in [Analysers.Query.build_cte] and + [Analysers.Query.eval_filters] *) + let q = "cte.group0" in + { + with_group = Some ast; + repr = Q.Query.funct "expr" [ Q.Query.literal q ]; + } +end + +module ASTBuilder = + Compose.Expression + (Ast) + (struct + let v = () + end) + +module F : + Sym.SYM_EXPR + with type 'a obs = 'a result + and type 'a path_repr = Format.formatter -> 'a -> unit = + ASTBuilder.Make (Filter) + +module M = Sym.M (F) + +let query_of_expression : + type b. + b Q.binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + b * 'a T.t option = + fun parameter formatter printer expr -> + let repr = M.eval ~path_repr:printer expr in + match parameter with + | BindParam -> + let p = Queue.create () in + let parameter = Q.QueryParameter.Queue p in + let value = F.observe repr in + value.repr ~nested:parameter formatter; + (p, value.group) + | NoParam -> + let value = F.observe repr in + value.repr ~nested:Literal formatter; + ((), value.group) diff --git a/lib/expression/filters.mli b/lib/expression/filters.mli new file mode 100644 index 0000000..d462b5f --- /dev/null +++ b/lib/expression/filters.mli @@ -0,0 +1,9 @@ +module F : Sym.SYM_EXPR with type 'a path_repr = Format.formatter -> 'a -> unit +(** Query used inside the filter clauses *) + +val query_of_expression : + 'b Query.binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + 'b * 'a T.t option diff --git a/lib/expression/headers.ml b/lib/expression/headers.ml new file mode 100644 index 0000000..6371e4f --- /dev/null +++ b/lib/expression/headers.ml @@ -0,0 +1,89 @@ +open StdLabels + +let truncate buffer n = Buffer.truncate buffer (Buffer.length buffer - n) + +module E : + Sym.SYM_CHUNK + with type 'a obs = buffer:Buffer.t -> unit + and type 'a path_repr = 'a -> Buffer.t -> unit = struct + type 'a repr = buffer:Buffer.t -> unit + type 'a obs = buffer:Buffer.t -> unit + type 'a path_repr = 'a -> Buffer.t -> unit + + let group : 'a repr list -> 'a repr = + fun args ~buffer -> + Buffer.add_string buffer "["; + List.iter args ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer ", "); + + truncate buffer 2; + Buffer.add_string buffer "]" + + let arguments : 'a repr list -> 'a repr = + fun expressions ~buffer -> + Buffer.add_string buffer "("; + List.iter expressions ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer ", "); + + truncate buffer 2; + Buffer.add_string buffer ")" + + let observe x ~buffer = x ~buffer + let empty : unit -> 'a repr = fun _ ~buffer -> Buffer.add_string buffer "''" + let path printer p ~buffer = printer p buffer + let literal l ~buffer = Buffer.add_string buffer l + let integer l ~buffer = Buffer.add_string buffer l + + let expr expr ~buffer = + Buffer.add_char buffer '('; + expr ~buffer; + Buffer.add_char buffer ')' + + let nvl expression ~buffer = + Buffer.add_string buffer 
"nvl"; + arguments ~buffer expression + + let concat expression ~buffer = List.iter expression ~f:(fun v -> v ~buffer) + + let join sep expression ~buffer = + List.iter expression ~f:(fun v -> + v ~buffer; + Buffer.add_string buffer sep); + truncate buffer (String.length sep) + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun name expressions order ~buffer -> + ignore order; + let name = T.name_of_window name in + + Buffer.add_string buffer name; + arguments ~buffer expressions + + let boperator name e1 e2 ~buffer = + e1 ~buffer; + Buffer.add_string buffer (T.name_of_operator name); + e2 ~buffer + + let gequality name e1 e2 ~buffer = + e1 ~buffer; + Buffer.add_string buffer (T.name_of_operator name); + group ~buffer e2 + + let funct name expressions ~buffer = + Buffer.add_string buffer name; + arguments ~buffer expressions + + let function' name expressions ~buffer = + Buffer.add_string buffer (T.name_of_function name); + arguments ~buffer expressions +end + +module M = Sym.M (E) + +let headers_of_expression : + Buffer.t -> ('a -> Buffer.t -> unit) -> 'a T.t -> unit = + fun buffer printer expr -> + let repr = M.eval expr ~path_repr:printer in + E.observe repr ~buffer diff --git a/lib/expression/headers.mli b/lib/expression/headers.mli new file mode 100644 index 0000000..1fafad0 --- /dev/null +++ b/lib/expression/headers.mli @@ -0,0 +1,7 @@ +val headers_of_expression : + Buffer.t -> ('a -> Buffer.t -> unit) -> 'a T.t -> unit + +module E : + Sym.SYM_EXPR + with type 'a obs = buffer:Buffer.t -> unit + and type 'a path_repr = 'a -> Buffer.t -> unit diff --git a/lib/expression/lazier.ml b/lib/expression/lazier.ml new file mode 100644 index 0000000..d8b12d9 --- /dev/null +++ b/lib/expression/lazier.ml @@ -0,0 +1,71 @@ +open StdLabels + +(** Make a module lazy *) +module Make (S : Sym.SYM_EXPR) = struct + type 'a repr = 'a S.repr Lazy.t + type 'a obs = 'a S.obs Lazy.t + type 'a path_repr = 'a S.path_repr + + let empty : unit -> 'a repr = fun () -> lazy (S.empty ()) + + let expr : 'a repr -> 'a repr = + fun expr -> Lazy.map (fun expr -> S.expr expr) expr + + let literal : string -> 'a repr = fun l -> lazy (S.literal l) + let integer : string -> 'a repr = fun i -> lazy (S.integer i) + + let path : 'b path_repr -> 'b -> 'a repr = + fun repr path -> lazy (S.path repr path) + + let concat : 'a repr list -> 'a repr = + fun exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.concat exprs') + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun w group sort -> + lazy + (let w' = T.map_window ~f:Lazy.force w + and group' = List.map ~f:Lazy.force group + and sort' = List.map ~f:Lazy.force sort in + S.window w' group' sort') + + let nvl : 'a repr list -> 'a repr = + fun exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.nvl exprs') + + let join : string -> 'a repr list -> 'a repr = + fun sep exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.join sep exprs') + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op e1 e2 -> + lazy + (let e1' = Lazy.force e1 and e2' = Lazy.force e2 in + S.boperator op e1' e2') + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op e exprs -> + lazy + (let e' = Lazy.force e and exprs' = List.map ~f:Lazy.force exprs in + S.gequality op e' exprs') + + let funct : string -> 'a repr list -> 'a repr = + fun name exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.funct name exprs') + + let 
function' : T.funct -> 'a repr list -> 'a repr = + fun f exprs -> + lazy + (let exprs' = List.map ~f:Lazy.force exprs in + S.function' f exprs') + + let observe : 'a repr -> 'a obs = fun v -> Lazy.map S.observe v +end diff --git a/lib/expression/query.ml b/lib/expression/query.ml new file mode 100644 index 0000000..5bd914a --- /dev/null +++ b/lib/expression/query.ml @@ -0,0 +1,335 @@ +(** + This module create an sql query from an expression. + *) + +open StdLabels + +(** This type is used in the query builder (see [query_of_expression] just + below in order to tell if we need to bind the parameters in the query, or + if we can use plain literal as is (with some risk at the execution time. *) +type _ binded_query = + | BindParam : ImportCSV.DataType.t Queue.t binded_query + | NoParam : unit binded_query + +module QueryParameter = struct + (** Internaly, we need to keep a different type for the Literal chunks + (which requires to be quoted), and raw (which should be given as is to the + sql engine) + + The Raw can be generated from both BindParam or NoParam queries. *) + type t = + | Literal + | Queue of ImportCSV.DataType.t Queue.t + | Raw of t + + (** Wrap the given parameter mode into the raw mode *) + let raw : t -> t = function + | Raw t -> Raw t + | Literal -> Raw Literal + | Queue q -> Raw (Queue q) + + (** Nest the parameter in order to use it inside another function call. + + The rule is to get out of the Raw mode as soon as we dive into another + one function. *) + let nest : t -> t = function + | Raw t -> t + | other -> other +end + +module TypeBuilder = + Compose.Expression + (Type_of) + (struct + let v = ignore + end) + +module Query = TypeBuilder.Make (struct + type 'a repr = Format.formatter -> nested:QueryParameter.t -> unit + type 'a obs = Format.formatter -> nested:QueryParameter.t -> unit + type 'a path_repr = Format.formatter -> 'a -> unit + + let observe : 'a Type_of.obs * 'a repr -> 'a obs = + fun (_, x) formatter ~nested -> + let () = x formatter ~nested in + Format.pp_print_flush formatter () + + (** Unify an external reference with a given type, using the COALESCE + function *) + let unify : + with_:Type_of.t -> + nested:QueryParameter.t -> + Format.formatter -> + 'a Type_of.obs * 'a repr -> + unit = + fun ~with_ ~nested format (type_of, expr) -> + match (type_of, with_) with + | ImportDataTypes.Types.Extern, Number + | ImportDataTypes.Types.Extern, Extern -> + Format.fprintf format "COALESCE(%a,0)" + (fun f expr -> expr f ~nested) + expr + | ImportDataTypes.Types.Extern, String -> + Format.fprintf format "COALESCE(%a,'')" + (fun f expr -> expr f ~nested) + expr + | _, Float -> + Format.fprintf format "CAST(%a AS REAL)" + (fun f expr -> expr f ~nested) + expr + | _, _ -> expr ~nested format + + let empty : 'a Type_of.obs -> 'a repr = + fun type_of formatter ~nested -> + ignore type_of; + ignore nested; + Format.fprintf formatter "''" + + let expr : 'a Type_of.obs * 'a repr -> 'a Type_of.obs -> 'a repr = + fun expr type_of formatter ~nested -> + ignore type_of; + Format.fprintf formatter "("; + (snd expr) ~nested formatter; + Format.fprintf formatter ")" + + let literal : string -> 'a Type_of.obs -> 'a repr = + fun l type_of formatter ~nested -> + ignore type_of; + match nested with + | QueryParameter.Literal -> + (* If the text is a true literal, we insert it directly. 
This is + only called from the [query_of_expression] function *) + Format.fprintf formatter "'%s'" l + | QueryParameter.Queue queue -> + Format.fprintf formatter "?"; + Queue.add (ImportCSV.DataType.Content l) queue + | QueryParameter.Raw _ -> Format.fprintf formatter "%s" l + + let integer : string -> 'a Type_of.obs -> 'a repr = + fun l type_of formatter ~nested -> + ignore type_of; + ignore nested; + Format.fprintf formatter "%s" l + + let path : 'b path_repr -> 'b -> 'a Type_of.obs -> 'a repr = + fun repr p type_of formatter ~nested -> + ignore nested; + ignore type_of; + repr formatter p + + let concat : ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f " || ") + (unify ~with_:ImportDataTypes.Types.String ~nested:nested') + formatter expression + + let print_expression : + ?sep:string -> + QueryParameter.t -> + Format.formatter -> + ('a Type_of.obs * 'a repr) list -> + unit = + fun ?(sep = ", ") nested formatter expression -> + (Format.pp_print_list + ~pp_sep:(fun f () -> Format.fprintf f "%s" sep) + (fun f v -> (snd v) f ~nested)) + formatter expression + + (** Format the partition expression. This function is used internally and + only form the expression inside the clause. *) + let group_windows : + QueryParameter.t -> + Format.formatter -> + ('a Type_of.obs * 'a repr) list + * ('a Type_of.obs * 'a repr) list + * string option -> + unit = + fun nested formatter (expressions, order, range) -> + match (expressions, order) with + | [], _ -> () + | _, [] -> + Format.fprintf formatter " OVER (PARTITION BY %a%a)" + (print_expression nested) expressions + (Format.pp_print_option (fun f v -> Format.fprintf f "%s" v)) + range + | _, _ -> + Format.fprintf formatter " OVER (PARTITION BY %a ORDER BY %a%a)" + (print_expression nested) expressions (print_expression nested) order + (Format.pp_print_option (fun f v -> Format.fprintf f "%s" v)) + range + + let window : + ('a Type_of.obs * 'a repr) T.window -> + ('a Type_of.obs * 'a repr) list -> + ('a Type_of.obs * 'a repr) list -> + 'a Type_of.obs -> + 'a repr = + fun name expressions order type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + (* By default, the range is defined like this + + [RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW EXCLUDE NO OTHERS] + + this only build a range until the current row, but in some cases (min, + last), we want to scan the whole group in order to evaluate the value to + keep. 
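+
+       Without this explicit frame, [LAST_VALUE] (used for [T.Max]) would only
+       see the rows up to the current one and would simply return the current
+       row's value.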
+ *) + let range = " RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in + match name with + | T.Min expr -> + Format.fprintf formatter "FIRST_VALUE(%a)%a " + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') + (expressions, order, Some range) + | T.Max expr -> + Format.fprintf formatter "LAST_VALUE(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') + (expressions, order, Some range) + | T.Counter -> + (* If no order is given, return the number of elements in the + whole group *) + let operator = + match order with + | [] -> "COUNT" + | _ -> "ROW_NUMBER" + in + Format.fprintf formatter "%s()%a" operator (group_windows nested') + (expressions, order, None) + | T.Previous expr -> + Format.fprintf formatter "LAG(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') (expressions, order, None) + | T.Sum expr -> + Format.fprintf formatter "SUM(%a)%a" + (fun f v -> (snd v) f ~nested:nested') + expr (group_windows nested') (expressions, order, None) + + let nvl : ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + Format.fprintf formatter "COALESCE(%a)" (print_expression nested') + expression + + let join : + string -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun sep expression type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + + (* Directly call the literal function for the first argument *) + Format.fprintf formatter "CONCAT(%a, %a)" + (fun f v -> (literal v ImportDataTypes.Types.String) f ~nested:nested') + sep (print_expression nested') expression + + let boperator : + T.binary_operator -> + 'a Type_of.obs * 'a repr -> + 'a Type_of.obs * 'a repr -> + 'a Type_of.obs -> + 'a repr = + fun name e1 e2 type_of formatter ~nested -> + ignore type_of; + (* When dividing, we need to be sure that the type is a float, + otherwise SQL will truncate the result *) + let with_ = + match name with + | T.Division -> ImportDataTypes.Types.Float + | _ -> fst e2 + in + + let nested' = QueryParameter.nest nested in + Format.fprintf formatter "%a%s%a" + (unify ~with_ ~nested:nested') + e1 + (* The operator *) + (T.name_of_operator name) + (unify ~with_:(fst e1) ~nested:nested') + e2 + + let gequality : + T.binary_operator -> + 'a Type_of.obs * 'a repr -> + ('a Type_of.obs * 'a repr) list -> + 'a Type_of.obs -> + 'a repr = + fun name e1 group type_of -> + ignore type_of; + let group_type = List.map ~f:fst group in + fun formatter ~nested -> + let nested' = QueryParameter.nest nested in + let op_name = + match name with + | T.Equal -> " IN(" + | T.Different -> " NOT IN(" + | _ -> "" + in + + Format.fprintf formatter "%a%s%a)" + (unify ~with_:(Type_of.group' group_type) ~nested:nested') + e1 op_name (print_expression nested') group + + let exprs expressions formatter ~nested = + (* Literal expression, starting from now, all the quoted string are + directly given to the sql engine *) + let nested' = QueryParameter.raw nested in + + Format.fprintf formatter "(%a)" + (print_expression ~sep:" " nested') + expressions + + let rec funct : + string -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun name expressions type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + match name with + | "expr" -> + (* Raw expression are parsed directly *) + exprs expressions formatter ~nested + | "if" -> + (* 
The if is renamed into IIF *) + funct "IIF" expressions type_of formatter ~nested + | _ -> + (* Default case *) + Format.fprintf formatter "%s(%a)" name (print_expression nested') + expressions + + let function' : + T.funct -> ('a Type_of.obs * 'a repr) list -> 'a Type_of.obs -> 'a repr = + fun name expressions type_of formatter ~nested -> + ignore type_of; + let nested' = QueryParameter.nest nested in + match name with + | Upper | Trim -> + Format.fprintf formatter "%s(%a)" (T.name_of_function name) + (print_expression nested') expressions +end) + +module M = Sym.M (Query) + +let query_of_expression : + type b. + b binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + b = + fun parameter formatter printer expr -> + let repr = M.eval ~path_repr:printer expr in + match parameter with + | BindParam -> + let p = Queue.create () in + let parameter = QueryParameter.Queue p in + Query.observe repr formatter ~nested:parameter; + p + | NoParam -> + Query.observe repr formatter ~nested:Literal; + () diff --git a/lib/expression/query.mli b/lib/expression/query.mli new file mode 100644 index 0000000..fa789a9 --- /dev/null +++ b/lib/expression/query.mli @@ -0,0 +1,27 @@ +module QueryParameter : sig + (** Internaly, we need to keep a different type for the Literal chunks + (which requires to be quoted), and raw (which should be given as is to the + sql engine) + + The Raw can be generated from both BindParam or NoParam queries. *) + type t = + | Literal + | Queue of ImportCSV.DataType.t Queue.t + | Raw of t +end + +type _ binded_query = + | BindParam : ImportCSV.DataType.t Queue.t binded_query + | NoParam : unit binded_query + +val query_of_expression : + 'b binded_query -> + Format.formatter -> + (Format.formatter -> 'a -> unit) -> + 'a T.t -> + 'b + +module Query : + Sym.SYM_EXPR + with type 'a obs = Format.formatter -> nested:QueryParameter.t -> unit + and type 'a path_repr = Format.formatter -> 'a -> unit diff --git a/lib/expression/repr.ml b/lib/expression/repr.ml new file mode 100644 index 0000000..4990236 --- /dev/null +++ b/lib/expression/repr.ml @@ -0,0 +1,127 @@ +open StdLabels + +let escape_dquote = Re.Str.regexp "'" +let escape content = Re.Str.global_replace escape_dquote "\\'" content + +module E : + Sym.SYM_CHUNK + with type 'a obs = top:bool -> string + and type 'a path_repr = 'a -> string = struct + type 'a repr = top:bool -> string + type 'a obs = top:bool -> string + type 'a path_repr = 'a -> string + + let observe x = x + + let group : 'a repr list -> 'a repr = + fun args ~top -> + let args_repr = List.map ~f:(fun v -> v ~top) args in + let args = String.concat ~sep:", " args_repr in + "[" ^ args ^ "]" + + let arguments : 'a repr list -> 'a repr = + fun args ~top -> + let args_repr = List.map ~f:(fun v -> v ~top) args in + let args = String.concat ~sep:", " args_repr in + "(" ^ args ^ ")" + + let empty : unit -> 'a repr = + fun () ~top -> + match top with + | false -> "''" + | true -> "" + + let literal : string -> 'a repr = + fun l ~top -> + if String.equal String.empty l then (empty ()) ~top + else + match int_of_string_opt l with + | Some _ -> l + | None -> "'" ^ escape l ^ "'" + + let integer : string -> 'a repr = + fun l ~top -> if String.equal String.empty l then (empty ()) ~top else l + + let expr : 'a repr -> 'a repr = + fun expr ~top -> + ignore top; + String.concat ~sep:"" [ "("; expr ~top:false; ")" ] + + let path : 'b path_repr -> 'b -> 'a repr = + fun path_repr p ~top -> + ignore top; + path_repr p + + let concat : 'a repr list -> 'a 
repr = + fun elems ~top -> + ignore top; + let top = false in + let strs = List.map elems ~f:(fun v -> v ~top) in + String.concat ~sep:" ^ " strs + + let window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr = + fun name g1 sort ~top -> + ignore top; + + let args1 = group ~top:false g1 + and args2 = group ~top:false sort + and f_name = T.name_of_window name in + let args = [ args1; args2 ] in + let args = + match name with + | T.Counter -> args + | T.Min prefix_arg + | T.Max prefix_arg + | T.Previous prefix_arg + | T.Sum prefix_arg -> prefix_arg ~top:false :: args + in + + f_name ^ "(" ^ String.concat ~sep:", " args ^ ")" + + let nvl : 'a repr list -> 'a repr = + fun elems ~top -> + ignore top; + let args = arguments ~top:false elems in + "nvl" ^ args + + let join : string -> 'a repr list -> 'a repr = + fun sep elems ~top -> + ignore top; + let header = literal sep in + let args = arguments ~top:false (header :: elems) in + "join" ^ args + + let boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr = + fun op arg1 arg2 ~top -> + ignore top; + let top = false in + let sep = T.name_of_operator op in + String.concat ~sep [ arg1 ~top; arg2 ~top ] + + let gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr = + fun op arg1 arg2 ~top -> + ignore top; + let top = false in + let sep = T.name_of_operator op in + let args = group ~top:false arg2 in + String.concat ~sep [ arg1 ~top; args ] + + let funct : string -> 'a repr list -> 'a repr = + fun f args ~top -> + ignore top; + let args = arguments ~top:false args in + f ^ args + + let function' : T.funct -> 'a repr list -> 'a repr = + fun f args ~top -> + ignore top; + let args = arguments ~top:false args in + T.name_of_function f ^ args +end + +module M = Sym.M (E) + +let repr : ?top:bool -> ('a -> string) -> 'a T.t -> string = + fun ?(top = false) printer expr -> + let repr = M.eval ~path_repr:printer expr in + E.observe repr ~top diff --git a/lib/expression/repr.mli b/lib/expression/repr.mli new file mode 100644 index 0000000..4431655 --- /dev/null +++ b/lib/expression/repr.mli @@ -0,0 +1,6 @@ +val repr : ?top:bool -> ('a -> string) -> 'a T.t -> string + +module E : + Sym.SYM_EXPR + with type 'a obs = top:bool -> string + and type 'a path_repr = 'a -> string diff --git a/lib/expression/sym.ml b/lib/expression/sym.ml new file mode 100644 index 0000000..0360e8e --- /dev/null +++ b/lib/expression/sym.ml @@ -0,0 +1,71 @@ +(** The signature for an expression analyzer. + + Every element is mapped to a function, using the tagless final pattern. 
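+
+    In this project the signature is implemented, among others, by [Repr.E]
+    (textual representation), [Headers.E], [Query.Query] (SQL generation) and
+    [Type_of] (type analysis): each module gives its own interpretation of the
+    same expression tree.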
+ + *) +module type SYM_EXPR = sig + type 'a repr + type 'a obs + type 'a path_repr + + val empty : unit -> 'a repr + val expr : 'a repr -> 'a repr + val literal : string -> 'a repr + val integer : string -> 'a repr + val path : 'a path_repr -> 'a -> 'a repr + val concat : 'a repr list -> 'a repr + val window : 'a repr T.window -> 'a repr list -> 'a repr list -> 'a repr + val nvl : 'a repr list -> 'a repr + val join : string -> 'a repr list -> 'a repr + val boperator : T.binary_operator -> 'a repr -> 'a repr -> 'a repr + val gequality : T.binary_operator -> 'a repr -> 'a repr list -> 'a repr + val funct : string -> 'a repr list -> 'a repr + val function' : T.funct -> 'a repr list -> 'a repr + val observe : 'a repr -> 'a obs +end + +module type SYM_CHUNK = sig + include SYM_EXPR + + val group : 'a repr list -> 'a repr + val arguments : 'a repr list -> 'a repr +end + +open StdLabels + +module M (Expr : SYM_EXPR) = struct + let rec eval : path_repr:'a Expr.path_repr -> 'a T.t -> 'c Expr.repr = + fun ~path_repr t -> + match t with + | T.Expr expr -> Expr.expr (eval ~path_repr expr) + | T.Empty -> Expr.empty () + | T.Literal s -> Expr.literal s + | T.Integer i -> Expr.integer i + | T.Concat elems -> Expr.concat (List.map elems ~f:(eval ~path_repr)) + | T.Function (name, args) -> + Expr.funct name (List.map args ~f:(eval ~path_repr)) + | T.Function' (name, args) -> + Expr.function' name (List.map args ~f:(eval ~path_repr)) + | T.Nvl elems -> Expr.nvl (List.map elems ~f:(eval ~path_repr)) + | T.Join (sep, args) -> Expr.join sep (List.map args ~f:(eval ~path_repr)) + | T.Window (name, group, sort) -> + Expr.window + (eval_window ~path_repr name) + (List.map group ~f:(eval ~path_repr)) + (List.map sort ~f:(eval ~path_repr)) + | T.BOperator (op, arg1, arg2) -> + Expr.boperator op (eval ~path_repr arg1) (eval ~path_repr arg2) + | T.GEquality (op, arg1, arg2) -> + Expr.gequality op (eval ~path_repr arg1) + (List.map arg2 ~f:(eval ~path_repr)) + | T.Path p -> Expr.path path_repr p + + and eval_window : + path_repr:'a Expr.path_repr -> 'a T.t T.window -> 'a Expr.repr T.window = + fun ~path_repr -> function + | Min a -> Min (eval ~path_repr a) + | Max a -> Max (eval ~path_repr a) + | Counter -> Counter + | Previous a -> Previous (eval ~path_repr a) + | Sum a -> Sum (eval ~path_repr a) +end diff --git a/lib/expression/t.ml b/lib/expression/t.ml new file mode 100644 index 0000000..7e61317 --- /dev/null +++ b/lib/expression/t.ml @@ -0,0 +1,153 @@ +open StdLabels + +type 'a window = + | Min of 'a + | Max of 'a + | Counter + | Previous of 'a + | Sum of 'a + +type 'a t = + | Empty + | Expr of 'a t + | Literal of string + | Integer of string + | Path of 'a + | Concat of 'a t list + | Function of string * 'a t list + | Nvl of 'a t list + | Join of string * 'a t list + | Window of ('a t window * 'a t list * 'a t list) + | BOperator of binary_operator * 'a t * 'a t + | GEquality of binary_operator * 'a t * 'a t list + | Function' of funct * 'a t list + +and binary_operator = + | Equal + | Different + | Add + | Minus + | Division + | LT + | GT + | And + | Or + +and funct = + | Upper + | Trim + +let name_of_function = function + | Upper -> "UPPER" + | Trim -> "TRIM" + +let name_of_operator = function + | Equal -> "=" + | Different -> "<>" + | Add -> "+" + | Minus -> "-" + | Division -> "/" + | LT -> "<" + | GT -> ">" + | And -> " and " + | Or -> " or " + +let name_of_window = function + | Min _ -> "min" + | Max _ -> "max" + | Counter -> "counter" + | Previous _ -> "previous" + | Sum _ -> "sum" + +let map_window : 
f:('a -> 'b) -> 'a window -> 'b window = + fun ~f -> function + | Min t -> Min (f t) + | Max t -> Max (f t) + | Counter -> Counter + | Previous t -> Previous (f t) + | Sum t -> Sum (f t) + +(** Extract the kind of the window function from the given name. *) +let window_of_name name opt = + match (name, opt) with + | "min", Some p -> Min p + | "max", Some p -> Max p + | "counter", None -> Counter + | "previous", Some p -> Previous p + | "sum", Some p -> Sum p + | _other -> raise Not_found + +let rec cmp : ('a -> 'a -> int) -> 'a t -> 'a t -> int = + fun f e1 e2 -> + match (e1, e2) with + | Empty, Empty -> 0 + | Literal l1, Literal l2 -> String.compare l1 l2 + | Integer l1, Integer l2 -> String.compare l1 l2 + | Path p1, Path p2 -> f p1 p2 + | Concat elems1, Concat elems2 | Nvl elems1, Nvl elems2 -> + List.compare ~cmp:(cmp f) elems1 elems2 + | Function (n1, elems1), Function (n2, elems2) -> + let name_cmp = String.compare n1 n2 in + if name_cmp = 0 then List.compare ~cmp:(cmp f) elems1 elems2 else name_cmp + | Window (s1, l11, l12), Window (s2, l21, l22) -> ( + match compare s1 s2 with + | 0 -> + let l1_cmp = List.compare ~cmp:(cmp f) l11 l21 in + if l1_cmp = 0 then List.compare ~cmp:(cmp f) l12 l22 else l1_cmp + | other -> other) + | BOperator (n1, arg11, arg12), BOperator (n2, arg21, arg22) -> begin + match compare n1 n2 with + | 0 -> begin + match cmp f arg11 arg21 with + | 0 -> cmp f arg12 arg22 + | other -> other + end + | other -> other + end + (* Any other case *) + | other1, other2 -> Stdlib.compare other1 other2 + +let fold_values : f:('b -> 'a -> 'b) -> init:'b -> 'a t -> 'b = + fun ~f ~init expression -> + let rec _f acc = function + | Empty | Literal _ | Integer _ -> acc + | Expr e -> _f acc e + | Path p -> f acc p + | Concat pp | Function' (_, pp) | Function (_, pp) | Nvl pp | Join (_, pp) + -> List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc pp + | Window (window_f, pp1, pp2) -> + (* Each window function can have a distinct parameter first. *) + let acc' = + match window_f with + | Counter -> acc + | Min key | Max key | Previous key | Sum key -> _f acc key + in + let eval1 = List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc' pp1 in + List.fold_left ~f:(fun acc a -> _f acc a) ~init:eval1 pp2 + | BOperator (_, arg1, arg2) -> _f (_f acc arg1) arg2 + | GEquality (_, arg1, arg2) -> + let eval1 = List.fold_left ~f:(fun acc a -> _f acc a) ~init:acc arg2 in + _f eval1 arg1 + in + _f init expression + +let map : type a b. 
f:(a -> b) -> a t -> b t = + fun ~f expression -> + let rec map = function + | Expr e -> Expr (map e) + | Empty -> Empty + | Literal s -> Literal s + | Integer i -> Integer i + | Path p -> Path (f p) + | Concat pp -> Concat (List.map ~f:map pp) + | Function' (name, pp) -> Function' (name, List.map ~f:map pp) + | Function (name, pp) -> Function (name, List.map ~f:map pp) + | Nvl pp -> Nvl (List.map ~f:map pp) + | Join (sep, pp) -> Join (sep, List.map ~f:map pp) + | Window (window_f, pp1, pp2) -> + let w = map_window ~f:map window_f in + Window (w, List.map ~f:map pp1, List.map ~f:map pp2) + | BOperator (n, arg1, arg2) -> BOperator (n, map arg1, map arg2) + | GEquality (n, arg1, args) -> GEquality (n, map arg1, List.map ~f:map args) + in + map expression diff --git a/lib/expression/t.mli b/lib/expression/t.mli new file mode 100644 index 0000000..840805d --- /dev/null +++ b/lib/expression/t.mli @@ -0,0 +1,54 @@ +type 'a window = + | Min of 'a + | Max of 'a + | Counter + | Previous of 'a + | Sum of 'a + +type 'a t = + | Empty + | Expr of 'a t + | Literal of string + | Integer of string + | Path of 'a + | Concat of 'a t list + | Function of string * 'a t list + | Nvl of 'a t list + | Join of string * 'a t list + | Window of ('a t window * 'a t list * 'a t list) + | BOperator of binary_operator * 'a t * 'a t + | GEquality of binary_operator * 'a t * 'a t list + | Function' of funct * 'a t list + +and binary_operator = + | Equal + | Different + | Add + | Minus + | Division + | LT + | GT + | And + | Or + +and funct = + | Upper + | Trim + +val cmp : ('a -> 'a -> int) -> 'a t -> 'a t -> int +(** Compare two expressions *) + +val fold_values : f:('b -> 'a -> 'b) -> init:'b -> 'a t -> 'b +(** Fold over all the path presents inside the expression. Used for example to + identify all the columns to extract from the file. + + The order is not guarantee to follow the order from the expression *) + +val map : f:('a -> 'b) -> 'a t -> 'b t +(** The map function. Mainly used in the configuration migration. *) + +val name_of_operator : binary_operator -> string +val name_of_window : 'a window -> string +val map_window : f:('a -> 'b) -> 'a window -> 'b window +val window_of_name : string -> 'a option -> 'a window +val name_of_function : funct -> string diff --git a/lib/expression/type_of.ml b/lib/expression/type_of.ml new file mode 100644 index 0000000..ce1a17e --- /dev/null +++ b/lib/expression/type_of.ml @@ -0,0 +1,150 @@ +(** + This module evaluate the type of an expression. + + The type is given with an analysis from all the component involved inside + the exrpssion. It is used inside the [query] module in order to check if one + type need conversion before being used. 
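+
+    For example a [Concat] node is always a [String], a [Path] is [Extern]
+    (its actual type depends on the spreadsheet content) and a [Division] is
+    typed as [Float].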
+ *) + +open StdLabels + +module Lazy_Repr = + Compose.Expression + (Lazier.Make + (Repr.E)) + (struct + let v _ = "" + end) + +type t = ImportDataTypes.Types.t + +(** Fold over the list of parameters and ensure all the elements are typed in +the same way *) +let group' : t list -> t = + fun elements -> + List.fold_left elements ~init:None + ~f:(fun (acc : ImportDataTypes.Types.t option) v -> + match acc with + | None -> Some v + | Some t when t = v -> acc + | _ -> Some Extern) + |> Option.value ~default:ImportDataTypes.Types.None + +include Lazy_Repr.Make (struct + type nonrec t = t + type 'a repr = t + type 'a obs = ImportDataTypes.Types.t + type 'a path_repr = 'a -> unit + + let observe : 'a Repr.E.obs Lazy.t * 'a repr -> 'a obs = snd + + let empty : 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ -> ImportDataTypes.Types.None + + let expr : 'a Repr.E.obs Lazy.t * 'a repr -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun e _ -> snd e + + let literal : string -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.String + + let integer : string -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.Number + + let path : 'b path_repr -> 'b -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ _ -> ImportDataTypes.Types.Extern + + let concat : + ('a Repr.E.obs Lazy.t * 'a repr) list -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun _ _ -> ImportDataTypes.Types.String + + let window : + ('a Repr.E.obs Lazy.t * 'a repr) T.window -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions order _ -> + ignore order; + ignore expressions; + match name with + | T.Counter | T.Max _ | T.Min _ | T.Sum _ -> Number + | T.Previous expr -> snd expr + + let nvl : + ('a Repr.E.obs Lazy.t * 'a repr) list -> 'a Repr.E.obs Lazy.t -> 'a repr = + fun v _ -> group' (List.map ~f:snd v) + + let join : + string -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun _ _ _ -> ImportDataTypes.Types.String + + let boperator : + T.binary_operator -> + 'a Repr.E.obs Lazy.t * 'a repr -> + 'a Repr.E.obs Lazy.t * 'a repr -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name _ _ _ -> + match name with + | T.Equal | T.Different | T.LT | T.GT -> Bool + | T.Add | T.Minus -> Number + | T.Division -> Float + | T.And | T.Or -> Bool + + let gequality : + T.binary_operator -> + 'a Repr.E.obs Lazy.t * 'a repr -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name _ _ _ -> + match name with + | T.Equal | T.Different -> Bool + | _ -> None + + let function' : + T.funct -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions _ -> + ignore expressions; + match name with + | Upper | Trim -> String + + let check : expected:t -> actual:t -> string -> 'a Repr.E.obs Lazy.t -> t = + fun ~expected ~actual subset expr -> + if actual = expected then actual + else + let expression = (Lazy.force expr) ~top:false in + raise (ImportErrors.TypeError { expression; subset; expected; actual }) + + let funct : + string -> + ('a Repr.E.obs Lazy.t * 'a repr) list -> + 'a Repr.E.obs Lazy.t -> + 'a repr = + fun name expressions repr -> + match name with + | "if" -> begin + match expressions with + | [] -> Extern + | (_, hd) :: arg1 :: _ when hd = Bool -> snd arg1 + | (_, hd) :: _ -> + let expected = ImportDataTypes.Types.Bool and actual = hd in + check ~expected ~actual "the predicate" repr + end + | _ -> Extern +end) + +let group : + ('a 
Lazier.Make(Repr.E).repr * t) list -> 'a Lazier.Make(Repr.E).repr * t = + fun v -> + let v' = group' (List.map v ~f:snd) in + let l = lazy (Repr.E.empty ()) in + (l, v') + +let arguments = group diff --git a/lib/expression/type_of.mli b/lib/expression/type_of.mli new file mode 100644 index 0000000..7a11582 --- /dev/null +++ b/lib/expression/type_of.mli @@ -0,0 +1,10 @@ +(** This module tries to identify the type of an expression. + +The references to data comming from the spreaedsheet cannot be evaluated and +marked as [Extern]. *) + +type t = ImportDataTypes.Types.t + +include Sym.SYM_CHUNK with type 'a obs = t and type 'a path_repr = 'a -> unit + +val group' : t list -> t diff --git a/lib/file_handler/csv2sql.ml b/lib/file_handler/csv2sql.ml new file mode 100644 index 0000000..42d84eb --- /dev/null +++ b/lib/file_handler/csv2sql.ml @@ -0,0 +1,135 @@ +open StdLabels +module A = ImportAnalyser.Dependency +module CSV = ImportCSV +module C = ImportContainers +module Syntax = ImportConf.Syntax +module Db = ImportSQL.Db + +type state = CSV.DataType.t array State.t + +let default_mapper : + (ImportCSV.DataType.t, ImportCSV.DataType.t array) State.mapper = + { get_row = Fun.id; get_value = Fun.id; default = ImportCSV.DataType.Null } + +let extract_values : string -> CSV.DataType.t = + fun value -> + (* Test first if the content is empty *) + if String.equal String.empty value then CSV.DataType.Null + else + (* else, try differents conversion in order to see which one works *) + match int_of_string_opt value with + | Some i -> CSV.DataType.Integer i + | None -> ( + match float_of_string_opt value with + | Some f -> CSV.DataType.Float f + | None -> + (* And finaly convert into date *) + CSV.DataType.Content value) + +(** Initialize the state for the first row, count the column number and create + the table in the database *) +let first_row : A.t -> _ Db.t -> state -> CSV.DataType.t list -> state = + fun mapping db acc row -> + (if acc.transaction then + match Db.commit db with + | Ok () -> () + | Error e -> print_endline (ImportErrors.repr_error e)); + + ignore @@ Db.create_table db mapping; + let row = Array.of_list row in + match Db.prepare_insert db mapping with + | Ok stmt -> + { + acc with + header = Some row; + transaction = false; + insert_stmt = Some stmt; + row_number = acc.row_number + 1; + } + | _ -> { acc with header = Some row; transaction = false; insert_stmt = None } + +let read_csv_line : + log_error:ImportErrors.t -> A.t -> 'a Db.t -> state -> string list -> state + = + fun ~log_error mapping db acc row -> + let processed_row = + List.to_seq row |> Seq.map extract_values |> Array.of_seq + in + if acc.State.transaction then + State.run_row ~log_error ~mapper:default_mapper mapping db processed_row acc + else + match Db.begin_transaction db with + | Error e -> + print_endline (ImportErrors.repr_error e); + acc + | Ok () -> + let acc = { acc with transaction = true } in + State.run_row ~log_error ~mapper:default_mapper mapping db processed_row + acc + +let importInDatable : + log_error:ImportErrors.t -> + conf:Syntax.t -> + dirname:string -> + A.t -> + 'a Db.t -> + CSV.DataType.t array option Lwt.t = + fun ~log_error ~conf ~dirname mapping db -> + let file = Filename.concat dirname (A.table mapping).file in + + let channel = Stdlib.open_in_bin file in + + let csv_channel = Csv.of_channel ~separator:';' ~excel_tricks:true channel in + + (* In the headers, we only keep the string. + + This line could generate an error if the headers are not correctly defined. 
+ *) + let header = + List.map ~f:(fun v -> CSV.DataType.Content v) (Csv.next csv_channel) + in + + let state = + State. + { + transaction = false; + header = None; + insert_stmt = None; + check_key_stmt = None; + row_number = 1; + sheet_number = 1; + delayed = []; + } + in + let state = first_row mapping db state header in + + let state = + try + Csv.fold_left csv_channel ~init:state + ~f:(read_csv_line ~log_error mapping db) + with + | Csv.Failure (line, row, cause) as e -> + Printf.eprintf "Error %s on line %d — field : %s\n" cause line + (ImportCSV.Csv.column_to_string row); + raise e + in + ignore @@ State.clear ~log_error db mapping conf; + ignore @@ Db.commit db; + + (* Finalize the statements created during the import *) + let () = + Option.iter (fun v -> ignore @@ Db.finalize v) state.insert_stmt; + Option.iter (fun v -> ignore @@ Db.finalize v) state.check_key_stmt + in + + (* Insert all the headers *) + let _ = + Option.iter + (fun headers -> + let values = Array.mapi headers ~f:(fun i value -> (i, value)) in + + ignore + @@ Db.insert_header db (ImportAnalyser.Dependency.table mapping) values) + state.header + in + Lwt.return state.header diff --git a/lib/file_handler/csv2sql.mli b/lib/file_handler/csv2sql.mli new file mode 100644 index 0000000..e09737b --- /dev/null +++ b/lib/file_handler/csv2sql.mli @@ -0,0 +1,10 @@ +val importInDatable : + log_error:ImportErrors.t -> + conf:ImportConf.Syntax.t -> + dirname:string -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + ImportCSV.DataType.t array option Lwt.t +(** Load an excel spreadsheet in an SQLite database. + +Return the header if at least one row where present *) diff --git a/lib/file_handler/dune b/lib/file_handler/dune new file mode 100755 index 0000000..6b247db --- /dev/null +++ b/lib/file_handler/dune @@ -0,0 +1,21 @@ +(library + (name importFileHandler) + (libraries + csv + SZXX + sqlite3 + base + core + lwt + lwt.unix + helpers + importAnalyser + importConf + importContainers + importCSV + importDataTypes + importErrors + importExpression + importSQL + ) +) diff --git a/lib/file_handler/state.ml b/lib/file_handler/state.ml new file mode 100644 index 0000000..5b43aff --- /dev/null +++ b/lib/file_handler/state.ml @@ -0,0 +1,178 @@ +open StdLabels +module Table = ImportDataTypes.Table + +type 'a t = { + header : 'a option; + transaction : bool; + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; + row_number : int; + sheet_number : int; + delayed : 'a list; +} + +type insert_result = { + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; +} + +type ('a, 'b) mapper = { + get_row : 'b -> 'a Array.t; + get_value : 'a -> ImportCSV.DataType.t; + default : 'a; +} + +module A = ImportAnalyser.Dependency + +let insert_row : + mapper:(_, 'row) mapper -> + A.t -> + _ ImportSQL.Db.t -> + 'row -> + _ t -> + (insert_result, ImportErrors.xlsError) result = + fun ~mapper mapping db row state -> + (* Extract all columns referenced in the keys or the columns to extract *) + let keys_id = + List.fold_left (A.keys mapping) ~init:ImportContainers.IntSet.empty + ~f:(fun acc (keys : A.key) -> + let columns = keys.A.columns in + ImportContainers.IntSet.union acc (Lazy.force columns)) + and columns_id = A.columns mapping in + let ids = ImportContainers.IntSet.(union keys_id columns_id |> elements) in + + (* Filter only the required columns in the row *) + let values = + List.map ids ~f:(fun i -> + let index = i - 1 in + let value = + try Array.get (mapper.get_row row) index with + | 
Stdlib.Invalid_argument _ -> + (* If we have more headers than data, assume the value are NULL. + This can happen when all the line tail is empty, Excel can + give us a truncated line instead of a series of NULL *) + mapper.default + in + (index, mapper.get_value value)) + in + let keys = A.keys mapping in + + let execution = + let ( let* ) = Result.bind in + let* check_key_stmt, result = + ImportSQL.Db.eval_key db state.check_key_stmt keys values + in + let no_null = + (* We check if we have at least one key which is not null — and in such + case we ignore the line. + + If multiple keys are presents, we ensure there is at least one non + null here. + *) + match result with + | [] -> true + | _ -> + List.exists result ~f:(function + | Sqlite3.Data.FLOAT _ | Sqlite3.Data.INT _ -> true + | Sqlite3.Data.BLOB t | Sqlite3.Data.TEXT t -> + not (String.equal "" t) + | Sqlite3.Data.NONE | Sqlite3.Data.NULL -> false) + in + let* _ = + match no_null with + | true -> Ok () + | false -> Error (Failure "The key is null") + in + + let* statement = + match state.insert_stmt with + | None -> ImportSQL.Db.prepare_insert db mapping + | Some v -> Ok v + in + let* _ = ImportSQL.Db.insert db statement ~id:state.row_number values in + let* _ = ImportSQL.Db.reset statement in + + Helpers.Console.update_cursor (); + Ok { insert_stmt = Some statement; check_key_stmt } + in + + (* In case of error, wrap the exception to get the line *) + Result.map_error + (fun e -> + ImportErrors. + { + source = ImportAnalyser.Dependency.table mapping; + sheet = state.sheet_number; + row = state.row_number; + target = None; + value = CSV.DataType.Content (String.concat ~sep:", " []); + exn = e; + }) + execution + +(** Load the row with all the informations associated with this sheet. + + If an error has already been raised during the sheet, ignore this row only. *) +let run_row : + log_error:ImportErrors.t -> + mapper:(_, 'row) mapper -> + A.t -> + _ ImportSQL.Db.t -> + 'row -> + 'a t -> + 'a t = + fun ~log_error ~mapper mapping db row state -> + match insert_row ~mapper mapping db row state with + | Ok { insert_stmt; check_key_stmt } -> + { + state with + insert_stmt; + check_key_stmt; + row_number = state.row_number + 1; + } + | Error e -> + Option.iter (fun v -> ignore @@ ImportSQL.Db.finalize v) state.insert_stmt; + Option.iter + (fun v -> ignore @@ ImportSQL.Db.finalize v) + state.check_key_stmt; + ImportErrors.output_error log_error e; + { + state with + insert_stmt = None; + check_key_stmt = None; + row_number = state.row_number + 1; + } + +let clear : + log_error:ImportErrors.t -> + 'a ImportSQL.Db.t -> + A.t -> + ImportConf.Syntax.t -> + unit ImportSQL.Db.result = + fun ~log_error db mapping conf -> + ImportSQL.Db.clear_duplicates db (A.table mapping) (A.keys mapping) + ~f:(fun values -> + let line = + match snd @@ Array.get values 0 with + | ImportCSV.DataType.Integer i -> i + | _ -> -1 + and value = snd @@ Array.get values 1 + and target = + match snd @@ Array.get values 2 with + | ImportCSV.DataType.Content s -> + Some (ImportConf.get_table_for_name conf (Some s)) + | _ -> None + in + let error = + ImportErrors. 
+ { + source = A.table mapping; + sheet = (A.table mapping).tab; + row = line; + target; + value; + exn = Failure "Duplicated key"; + } + in + + ImportErrors.output_error log_error error) diff --git a/lib/file_handler/state.mli b/lib/file_handler/state.mli new file mode 100644 index 0000000..f744c33 --- /dev/null +++ b/lib/file_handler/state.mli @@ -0,0 +1,46 @@ +type 'a t = { + header : 'a option; + transaction : bool; + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; + row_number : int; + sheet_number : int; + delayed : 'a list; +} + +type insert_result = { + insert_stmt : Sqlite3.stmt option; + check_key_stmt : Sqlite3.stmt option; +} + +type ('a, 'b) mapper = { + get_row : 'b -> 'a Array.t; + get_value : 'a -> ImportCSV.DataType.t; + default : 'a; +} + +val insert_row : + mapper:(_, 'row) mapper -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + 'row -> + _ t -> + (insert_result, ImportErrors.xlsError) result +(** Low level row insertion *) + +val run_row : + log_error:ImportErrors.t -> + mapper:(_, 'row) mapper -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + 'row -> + 'a t -> + 'a t + +val clear : + log_error:ImportErrors.t -> + 'a ImportSQL.Db.t -> + ImportAnalyser.Dependency.t -> + ImportConf.Syntax.t -> + unit ImportSQL.Db.result +(** Clean up the table after the insertion, check for the duplicates and external references *) diff --git a/lib/file_handler/xlsx2sql.ml b/lib/file_handler/xlsx2sql.ml new file mode 100644 index 0000000..f2d8f12 --- /dev/null +++ b/lib/file_handler/xlsx2sql.ml @@ -0,0 +1,205 @@ +open StdLabels +module A = ImportAnalyser.Dependency +module C = ImportContainers +module CSV = ImportCSV +module Syntax = ImportConf.Syntax +module Db = ImportSQL.Db + +let flags = Unix.[ O_RDONLY; O_NONBLOCK ] + +let extractors = + SZXX.Xlsx. + { + string = (fun _location s -> CSV.DataType.Content s); + error = + (fun _location s -> CSV.DataType.Error (Printf.sprintf "#ERROR# %s" s)); + boolean = + (fun _location s -> + let value = String.(equal s "1") in + CSV.DataType.Content (string_of_bool value)); + number = + (fun _location s -> + let f = Float.of_string s in + if Float.is_integer f then CSV.DataType.Integer (Float.to_int f) + else CSV.DataType.Float f); + date = (fun _location s -> CSV.DataType.Content s); + null = CSV.DataType.Null; + formula = + (fun _location ~formula s -> + ignore formula; + CSV.DataType.Content s); + } + +let feed_bigstring ic = + let open Lwt.Infix in + let len = Lwt_io.buffer_size ic in + let buf = Lwt_bytes.create len in + SZXX.Zip.Bigstring + (fun () -> + Lwt_io.read_into_bigstring ic buf 0 len >|= function + | 0 -> None + | len -> Some SZXX.Zip.{ buf; pos = 0; len }) + +(* Evaluate if the row can be processed right now (does not contain + any delayed value) *) +let is_delayed row = + Array.exists row.SZXX.Xlsx.data ~f:(function + | SZXX.Xlsx.Delayed _ -> true + | _ -> false) + +let default_mapper : + (ImportCSV.DataType.t, ImportCSV.DataType.t SZXX.Xlsx.row) State.mapper = + { + get_value = + (function + | ImportCSV.DataType.Content s -> + ImportCSV.DataType.Content (SZXX.Xml.unescape s) + | any -> any); + default = ImportCSV.DataType.Null; + get_row = (fun v -> v.SZXX.Xlsx.data); + } + +type state = CSV.DataType.t SZXX.Xlsx.status SZXX.Xlsx.row State.t + +let delayed_mapper = + State. 
+ { + get_value = + (function + | SZXX.Xlsx.Available (CSV.DataType.Content s) -> + CSV.DataType.Content (SZXX.Xml.unescape s) + | SZXX.Xlsx.Available value -> value + | _ -> CSV.DataType.Null); + default = SZXX.Xlsx.Available CSV.DataType.Null; + get_row = (fun v -> v.SZXX.Xlsx.data); + } + +(** Initialize the state for the first row, count the column number and create + the table in the database *) +let first_row : A.t -> _ Db.t -> state -> 'a SZXX.Xlsx.row -> state = + fun mapping db acc row -> + (if acc.transaction then + match Db.commit db with + | Ok () -> () + | Error e -> print_endline (ImportErrors.repr_error e)); + + ignore @@ Db.create_table db mapping; + match Db.prepare_insert db mapping with + | Ok stmt -> + { + acc with + header = Some row; + transaction = false; + insert_stmt = Some stmt; + } + | _ -> { acc with header = Some row; transaction = false; insert_stmt = None } + +let importInDatable : + log_error:Csv.out_channel Lazy.t -> + conf:Syntax.t -> + dirname:string -> + A.t -> + 'a Db.t -> + CSV.DataType.t array option Lwt.t = + fun ~log_error ~conf ~dirname mapping db -> + let file = Filename.concat dirname (A.table mapping).file in + + Lwt_io.with_file ~flags ~mode:Input file (fun ic -> + let open Lwt.Syntax in + let stream, sst_p, success = + SZXX.Xlsx.stream_rows ~only_sheet:(A.table mapping).tab + ~feed:(feed_bigstring ic) extractors + in + let* processed = + Lwt_stream.fold + (fun row acc -> + (* Create the table on the first line *) + if Int.equal 1 row.SZXX.Xlsx.row_number then + first_row mapping db acc row + else + match is_delayed row with + | true -> { acc with delayed = row :: acc.delayed } + | false -> ( + let row_number = row.SZXX.Xlsx.row_number in + if acc.transaction then + State.run_row ~log_error ~mapper:delayed_mapper mapping db + row { acc with row_number } + else + match Db.begin_transaction db with + | Error e -> + print_endline (ImportErrors.repr_error e); + acc + | Ok () -> + let acc = { acc with transaction = true; row_number } in + State.run_row ~log_error ~mapper:delayed_mapper mapping + db row acc)) + stream + { + transaction = false; + header = None; + delayed = []; + insert_stmt = None; + check_key_stmt = None; + row_number = 1; + sheet_number = (A.table mapping).tab; + } + in + (* Wait to reach the sst *) + let* sst = sst_p in + + if processed.transaction then ignore (Db.commit db); + + (* Insert the missing elements *) + ignore @@ Db.begin_transaction db; + List.iter processed.delayed ~f:(fun row -> + let fully_available_row = + SZXX.Xlsx.unwrap_status extractors sst row + in + + let row_number = row.SZXX.Xlsx.row_number in + + match + State.insert_row ~mapper:default_mapper mapping db + fully_available_row + { processed with row_number } + with + | Ok _ -> () + | Error e -> + ImportErrors.output_error log_error e; + ()); + + ignore @@ State.clear ~log_error db mapping conf; + ignore @@ Db.commit db; + + (* Finalize the statements created during the import *) + let () = + Option.iter (fun v -> ignore @@ Db.finalize v) processed.insert_stmt; + Option.iter (fun v -> ignore @@ Db.finalize v) processed.check_key_stmt + in + + let _ = + Option.iter + (fun headers -> + let res = SZXX.Xlsx.unwrap_status extractors sst headers in + + let values = Array.mapi res.data ~f:(fun i value -> (i, value)) in + + ignore + @@ Db.insert_header db + (ImportAnalyser.Dependency.table mapping) + values) + processed.header + in + + let header = + Option.map + (fun header -> + let res = SZXX.Xlsx.unwrap_status extractors sst header in + res.data) + 
processed.header + in + + (* Finalize the process *) + let* () = success in + + Lwt.return header) diff --git a/lib/file_handler/xlsx2sql.mli b/lib/file_handler/xlsx2sql.mli new file mode 100644 index 0000000..e09737b --- /dev/null +++ b/lib/file_handler/xlsx2sql.mli @@ -0,0 +1,10 @@ +val importInDatable : + log_error:ImportErrors.t -> + conf:ImportConf.Syntax.t -> + dirname:string -> + ImportAnalyser.Dependency.t -> + _ ImportSQL.Db.t -> + ImportCSV.DataType.t array option Lwt.t +(** Load an excel spreadsheet in an SQLite database. + +Return the header if at least one row where present *) diff --git a/lib/helpers/console.ml b/lib/helpers/console.ml new file mode 100644 index 0000000..838b25a --- /dev/null +++ b/lib/helpers/console.ml @@ -0,0 +1,16 @@ +let cursors = [| '|'; '/'; '-'; '\\' |] +let pos = ref 0 + +let update_cursor () = + if Unix.(isatty stdout) then ( + Printf.printf "%c[?25l%c[1D%c[0K%c" (char_of_int 27) (char_of_int 27) + (char_of_int 27) (Array.get cursors !pos); + pos := (!pos + 1) mod Array.length cursors) + +let close_cursor () = + if Unix.(isatty stdout) then + Printf.printf "%c[?25h%c[1D%c[0K\n%!" (char_of_int 27) (char_of_int 27) + (char_of_int 27) + +let clear_line () = + if Unix.(isatty stdout) then Printf.printf "%c[2K\r%!" (char_of_int 27) diff --git a/lib/helpers/console.mli b/lib/helpers/console.mli new file mode 100644 index 0000000..289d55c --- /dev/null +++ b/lib/helpers/console.mli @@ -0,0 +1,5 @@ +val update_cursor : unit -> unit +val close_cursor : unit -> unit + +val clear_line : unit -> unit +(** Clear the entire line *) diff --git a/lib/helpers/dune b/lib/helpers/dune new file mode 100755 index 0000000..8e30d2b --- /dev/null +++ b/lib/helpers/dune @@ -0,0 +1,8 @@ +(library + (name helpers) + (libraries + calendar + decoders + otoml + ) +) diff --git a/lib/helpers/helpers.ml b/lib/helpers/helpers.ml new file mode 100755 index 0000000..9d6fcb8 --- /dev/null +++ b/lib/helpers/helpers.ml @@ -0,0 +1,45 @@ +module Toml = Toml +module Console = Console + +let date_from_csv : string -> CalendarLib.Date.t option = + fun value -> + let open CalendarLib.Date in + try Some (Scanf.sscanf value "%d/%d/%d" (fun d m y -> make y m d)) with + | _ -> ( + (* If the date is a number, try from julian day *) + match int_of_string_opt value with + | None -> None + | Some v -> Some (add (make 1899 12 30) (Period.day v))) + +let fold_opt : ('a -> 'b -> 'a option) -> 'a -> 'b -> 'a = + fun f acc b -> + match f acc b with + | None -> acc + | Some v -> v + +let try_opt exp = + try Some (exp ()) with + | _ -> None + +let repr_date formatter date = + Format.fprintf formatter "%02d/%02d/%d" + (CalendarLib.Date.day_of_month date) + CalendarLib.Date.(int_of_month @@ month date) + (CalendarLib.Date.year date) + +let s_repr_date date = + Format.sprintf "%02d/%02d/%d" + (CalendarLib.Date.day_of_month date) + CalendarLib.Date.(int_of_month @@ month date) + (CalendarLib.Date.year date) + +let repr_opt f channel = function + | None -> () + | Some v -> f channel v + +let str_format f = + let buffer = Buffer.create 16 in + let formatter = Format.formatter_of_buffer buffer in + f formatter; + Format.pp_print_flush formatter (); + Buffer.contents buffer diff --git a/lib/helpers/toml.ml b/lib/helpers/toml.ml new file mode 100644 index 0000000..1b7fb15 --- /dev/null +++ b/lib/helpers/toml.ml @@ -0,0 +1,31 @@ +module Decode = struct + module S = struct + type value = Otoml.t + + let pp : Format.formatter -> value -> unit = + fun format t -> Format.pp_print_string format (Otoml.Printer.to_string t) 
+ + let of_string : string -> (value, string) result = + Otoml.Parser.from_string_result + + let of_file : string -> (value, string) result = + Otoml.Parser.from_file_result + + let get_string : value -> string option = Otoml.get_opt Otoml.get_string + let get_int : value -> int option = Otoml.get_opt Otoml.get_integer + let get_float : value -> float option = Otoml.get_opt Otoml.get_float + let get_bool : value -> bool option = Otoml.get_opt Otoml.get_boolean + let get_null : value -> unit option = fun _ -> None + + let get_list : value -> value list option = + Otoml.get_opt @@ Otoml.get_array Fun.id + + let get_key_value_pairs : value -> (value * value) list option = + Otoml.get_opt (fun key -> + Otoml.get_table key |> List.map (fun (k, v) -> (Otoml.string k, v))) + + let to_list : value list -> value = Otoml.array + end + + include Decoders.Decode.Make (S) +end diff --git a/lib/helpers/toml.mli b/lib/helpers/toml.mli new file mode 100644 index 0000000..08d30b8 --- /dev/null +++ b/lib/helpers/toml.mli @@ -0,0 +1 @@ +module Decode : Decoders.Decode.S with type value = Otoml.t diff --git a/lib/sql/date.ml b/lib/sql/date.ml new file mode 100644 index 0000000..e8933c7 --- /dev/null +++ b/lib/sql/date.ml @@ -0,0 +1,24 @@ +(** Parse a text value into a date *) + +let first_day = CalendarLib.Date.make 1899 12 30 + +let f : Sqlite3.Data.t -> Sqlite3.Data.t -> Sqlite3.Data.t = + fun str data -> + match (str, data) with + | Sqlite3.Data.TEXT format_, Sqlite3.Data.TEXT content -> ( + try + let date = CalendarLib.Printer.Date.from_fstring format_ content in + let period = + CalendarLib.Date.sub date first_day + |> CalendarLib.Date.Period.nb_days |> Int64.of_int + in + Sqlite3.Data.INT period + with + | Invalid_argument e -> + prerr_endline e; + Sqlite3.Data.NULL) + | _ -> + (* If the data is already a date, it should be preserved as is *) + data + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun2 db "date" f diff --git a/lib/sql/db.ml b/lib/sql/db.ml new file mode 100644 index 0000000..89431b1 --- /dev/null +++ b/lib/sql/db.ml @@ -0,0 +1,383 @@ +open StdLabels +module CSV = ImportCSV +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path + +type 'a t = 'a T.t + +let ( let* ) res cont = Result.bind (T.to_result res) cont +let ( let** ) res cont = Result.bind res cont +let begin_transaction = T.begin_transaction +let rollback = T.rollback +let commit = T.commit +let finalize = T.finalize +let reset = T.reset +let insert_header = Header.insert_header +let query_headers = Header.query_headers + +let with_db : string -> (Sqlite3.db -> unit T.result) -> unit T.result = + fun filename f -> + let db = Sqlite3.db_open filename in + + Match.register db; + Date.register db; + Year.register db; + Join.register db; + Math.register db; + Trim.register db; + + (*let* _ = Sqlite3.exec db "PRAGMA foreign_keys = ON" |> to_result in*) + Sqlite3.( let& ) db f + +let create_table : 'a t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let name = Table.name source in + + let* _ = + Sqlite3.exec db + (String.concat ~sep:"" [ "DROP TABLE IF EXISTS '"; name; "'" ]) + in + + let query = ImportAnalyser.Query.create_table table in + let* _ = Sqlite3.exec db query in + match Header.create_table db with + | Ok () -> Hashs.create_table db + | e -> e + +let update_hash : 'a t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db mapping -> + match Hashs.insert db mapping with + | 
Ok () -> Ok () + | Error _ -> + let _ = Hashs.create_table db in + Hashs.insert db mapping + +let check_table_schema : 'a t -> ImportAnalyser.Dependency.t -> bool T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let name = Table.name source in + let query = + String.concat ~sep:"" + [ "SELECT sql FROM sqlite_schema WHERE name = '"; name; "'" ] + in + let stmt = Sqlite3.prepare db query in + + let rc, result = + Sqlite3.fold stmt ~init:false ~f:(fun value row -> + if Array.length row <> 1 then value + else + match Sqlite3.Data.to_string (Array.get row 0) with + | Some s -> + let query = ImportAnalyser.Query.create_table table in + String.equal s query + | None -> value) + in + let* _ = rc in + + (* The schema is the same, now check the hash in case the indexes changed *) + let rc_hash = Hashs.query db source in + match rc_hash with + | Ok (Some i) -> + let hash = Hashs.evaluate table in + begin + if i == hash then Ok result + else + let _ = update_hash db table in + Ok false + end + | _ -> + let _ = update_hash db table in + Ok result + +let prepare_insert : + Sqlite3.db -> ImportAnalyser.Dependency.t -> Sqlite3.stmt T.result = + fun db mapping -> + (* Get the list of columns from the table configuration *) + let columns = + ImportAnalyser.Dependency.columns mapping + |> ImportContainers.IntSet.elements + in + let table_name = Table.name (ImportAnalyser.Dependency.table mapping) in + + let open Buffer in + let buff = create 20 and value_buff = create 10 and col_buff = create 10 in + + add_string col_buff "'id',"; + + (* Add the key name if present *) + List.iter (ImportAnalyser.Dependency.keys mapping) + ~f:(fun { ImportAnalyser.Dependency.name; _ } -> + add_string col_buff "'key_"; + + add_string col_buff name; + add_string col_buff "',"); + + add_string value_buff ":id,"; + + (* Add the key settings if presents *) + List.iter (ImportAnalyser.Dependency.keys mapping) ~f:(fun key -> + ImportAnalyser.Query.build_key_insert value_buff key; + add_string value_buff ","); + + List.iter columns ~f:(fun id -> + add_string col_buff "'col_"; + add_string col_buff (string_of_int id); + add_string col_buff "',"; + + let col_name = ":col_" ^ string_of_int id in + + add_string value_buff col_name; + add_string value_buff ","); + + truncate col_buff (length col_buff - 1); + truncate value_buff (length value_buff - 1); + add_string buff "INSERT INTO '"; + add_string buff table_name; + add_string buff "' ("; + add_buffer buff col_buff; + add_string buff " ) VALUES ("; + add_buffer buff value_buff; + add_string buff " )"; + + let query = contents buff in + + try Ok (Sqlite3.prepare db query) with + | e -> + print_endline "Error during this query :"; + print_endline query; + Error e + +let eval_key : + 'a t -> + Sqlite3.stmt option -> + ImportAnalyser.Dependency.key list -> + (int * CSV.DataType.t) list -> + (Sqlite3.stmt option * Sqlite3.Data.t list) T.result = + fun db stmt keys values -> + match keys with + | [] -> Ok (None, []) + | _ -> + let buffer = Buffer.create 16 in + Buffer.add_string buffer "SELECT "; + List.iter keys ~f:(fun key -> + ImportAnalyser.Query.build_key_insert buffer key; + Buffer.add_string buffer ","); + + Buffer.truncate buffer (Buffer.length buffer - 1); + let query = Buffer.contents buffer in + + let statement = Sqlite3.prepare_or_reset db (ref stmt) query in + + (* Extract all the column id used in the keys. 
+ *) + let keys_id = + List.fold_left keys ~init:ImportContainers.IntSet.empty + ~f:(fun acc (keys : ImportAnalyser.Dependency.key) -> + let columns = Lazy.force keys.ImportAnalyser.Dependency.columns in + ImportContainers.IntSet.union acc columns) + in + + let** _ = + List.fold_left values ~init:(Ok 1) ~f:(fun idx (id, value) -> + let** idx = idx in + + (* Ensure the column is required in the keys *) + match ImportContainers.IntSet.mem (1 + id) keys_id with + | false -> Ok (idx + 1) + | true -> + let sql_data = T.of_datatype value in + + let col_name = ":col_" ^ string_of_int (1 + id) in + let* _ = Sqlite3.bind_name statement col_name sql_data in + Ok (idx + 1)) + in + + let result, evaluated_keys = + Sqlite3.fold statement ~init:[] ~f:(fun _ v -> Array.to_list v) + in + let* _ = result in + Ok (Some statement, evaluated_keys) + +let insert : + Sqlite3.db -> + Sqlite3.stmt -> + id:int -> + (int * CSV.DataType.t) list -> + unit T.result = + fun db statement ~id values -> + let** _ = T.savepoint db "PREVIOUS" in + let* _ = + Sqlite3.bind_name statement ":id" (Sqlite3.Data.INT (Int64.of_int id)) + in + let** _ = + List.fold_left values ~init:(Ok 1) ~f:(fun idx (id, value) -> + let** idx = idx in + let sql_data = T.of_datatype value in + + let col_name = ":col_" ^ string_of_int (1 + id) in + let* _ = Sqlite3.bind_name statement col_name sql_data in + + Ok (idx + 1)) + in + + match T.to_result (Sqlite3.step statement) with + | Ok () -> T.release db "PREVIOUS" + | Error e -> + (* I intentionnaly ignore any error here, as we are already in an + error case *) + ignore (Sqlite3.exec db "ROLLBACK TRANSACTION TO SAVEPOINT PREVIOUS"); + Error e + +(** This simple function convert a query generated by the application into a + statement executed with sqlite. + + The function expect a perfect match between the query and the parameters. *) +let execute_query : + Sqlite3.db -> ImportAnalyser.Query.query -> Sqlite3.stmt T.result = + fun db query -> + let statement = + try Sqlite3.prepare db query.q with + | e -> + print_endline "Error during this query :"; + print_endline query.q; + raise e + in + + (* Add the binded parameters *) + let values = + Seq.map (fun v -> T.of_datatype v) query.parameters |> List.of_seq + in + + let* _ = Sqlite3.bind_values statement values in + + Ok statement + +let query : + f:((Path.t ImportExpression.T.t * CSV.DataType.t) array -> unit) -> + Sqlite3.db -> + Syntax.t -> + unit T.result = + fun ~f db output -> + (* Extract the query from the configuration. 
*) + let** query_analysis = + match ImportAnalyser.Query.select output with + | exception e -> Error e + | other -> Ok other + in + + let query, columns = query_analysis in + let** statement = execute_query db query in + + let* _ = + Sqlite3.iter statement ~f:(fun data -> + let values = + Array.map2 data columns ~f:(fun value column -> + (column, T.to_datatype value)) + in + f values) + in + Ok () + +let create_view : Sqlite3.db -> Syntax.t -> unit T.result = + fun db output -> + ignore output; + let* drop = Sqlite3.exec db "DROP VIEW IF EXISTS 'result'" in + ignore drop; + + Ok () + +(* + let query, _ = ImportAnalyser.Query.select output in + + let query = + { query with q = Printf.sprintf "CREATE VIEW result AS %s" query.q } + in + let** statement = execute_query db query in + + Sqlite3.step statement |> T.to_result + *) + +let check_foreign : + f:((string * CSV.DataType.t) array -> unit) -> + Sqlite3.db -> + Syntax.t -> + Syntax.extern -> + unit T.result = + fun ~f db conf external_ -> + let query = ImportAnalyser.Query.check_external conf external_ in + + let** statement = execute_query db query in + Sqlite3.iter statement ~f:(fun data -> + let values = + Array.mapi data ~f:(fun i value -> + (Sqlite3.column_name statement i, T.to_datatype value)) + in + f values) + |> T.to_result + +let clear_duplicates : + f:((string * ImportCSV.DataType.t) array -> unit) -> + 'a t -> + ImportDataTypes.Table.t -> + ImportAnalyser.Dependency.key list -> + unit T.result = + fun ~f db table keys -> + let table_name = Table.name table in + + List.fold_left keys ~init:(Ok ()) + ~f:(fun state { ImportAnalyser.Dependency.name; _ } -> + let** _ = state in + + let select_query = + String.concat ~sep:"" + [ + "SELECT '"; + table_name; + "'.id, '"; + table_name; + "'.'key_"; + name; + "', '"; + name; + "' FROM '"; + table_name; + "' INNER JOIN (SELECT id, row_number() OVER(PARTITION BY '"; + table_name; + "'.'key_"; + name; + "' ORDER BY (id)) AS row_num from '"; + table_name; + "') other_table WHERE other_table.row_num <> 1 and '"; + table_name; + "'.id = other_table.id"; + ] + in + let stmt = Sqlite3.prepare db select_query in + + ignore + @@ Sqlite3.iter stmt ~f:(fun data -> + let values = + Array.mapi data ~f:(fun i value -> + (Sqlite3.column_name stmt i, T.to_datatype value)) + in + f values); + + let delete_query = + Printf.sprintf + {|UPDATE '%s' +SET key_%s = NULL +FROM +( + SELECT id, row_number() OVER(PARTITION BY key_%s ORDER BY (id)) AS row_num + from '%s' +) other_table +WHERE other_table.row_num <> 1 +and '%s'.id = other_table.id|} + table_name name name table_name table_name + in + + Sqlite3.exec db delete_query |> T.to_result) + +type 'a result = ('a, exn) Result.t diff --git a/lib/sql/db.mli b/lib/sql/db.mli new file mode 100644 index 0000000..465b159 --- /dev/null +++ b/lib/sql/db.mli @@ -0,0 +1,106 @@ +module Syntax = ImportConf.Syntax + +type 'a t +type 'a result = ('a, exn) Result.t + +val with_db : string -> ('a t -> unit result) -> unit result + +val check_table_schema : 'a t -> ImportAnalyser.Dependency.t -> bool result +(** Check if a table with the same structure already exists in the database. + + This query allow to reuse the same data without reloading the file if + nothing changed. *) + +val create_table : 'a t -> ImportAnalyser.Dependency.t -> unit result +(** [create_table db name] will create a new table in the + db with the given name, and the columns from the configuration (see + [ImportAnalyser.Query.create_table]) + + Any previous table with the same name will be deleted. 
*)
+
+val prepare_insert : 'a t -> ImportAnalyser.Dependency.t -> Sqlite3.stmt result
+(** Create a statement to use in an insert.
+    [prepare_insert db table] will prepare a statement for inserting
+    the columns at the given index. *)
+
+val finalize : Sqlite3.stmt -> unit result
+(** Finalize the statement. The function shall be called once all the inserts
+    are done, or after an error during an insert. *)
+
+val reset : Sqlite3.stmt -> unit result
+
+val eval_key :
+  'a t ->
+  Sqlite3.stmt option ->
+  ImportAnalyser.Dependency.key list ->
+  (int * ImportCSV.DataType.t) list ->
+  (Sqlite3.stmt option * Sqlite3.Data.t list) result
+(** Evaluate the keys in SQLite and get the results.
+
+    The function is intended to check if the values are null before inserting
+    them in a batch *)
+
+val insert :
+  'a t ->
+  Sqlite3.stmt ->
+  id:int ->
+  (int * ImportCSV.DataType.t) list ->
+  unit result
+(** Insert a new row in the database.
+
+    [insert db ~id statement values] will add a new row in the given table with
+    [id]. The columns are identified with their index number (which is offset
+    by one from the column number).
+
+    Thanks to SQLite Flexible Typing (https://www.sqlite.org/flextypegood.html)
+    each column can contain values of different types, which is also how the
+    spreadsheet works.
+
+    This function is expected to be run inside a transaction. *)
+
+val begin_transaction : 'a t -> unit result
+val commit : 'a t -> unit result
+val rollback : 'a t -> unit result
+
+val query :
+  f:
+    ((ImportDataTypes.Path.t ImportExpression.T.t * ImportCSV.DataType.t) array ->
+    unit) ->
+  'a t ->
+  Syntax.t ->
+  unit result
+(** This is the most important function of the application. It transforms the
+    configuration into an SQL query and fetches the result from the SQLite
+    engine.
+
+    The function [f] given as argument will be called for each line.
+
+    *)
+
+val create_view : 'a t -> Syntax.t -> unit result
+(** Create a view which represents the result *)
+
+val check_foreign :
+  f:((string * ImportCSV.DataType.t) array -> unit) ->
+  'a t ->
+  Syntax.t ->
+  Syntax.extern ->
+  unit result
+
+val clear_duplicates :
+  f:((string * ImportCSV.DataType.t) array -> unit) ->
+  'a t ->
+  ImportDataTypes.Table.t ->
+  ImportAnalyser.Dependency.key list ->
+  unit result
+(** Remove all duplicated keys in the table by setting them to NULL. *)
+
+val insert_header :
+  'a t ->
+  ImportDataTypes.Table.t ->
+  (int * ImportCSV.DataType.t) array ->
+  unit T.result
+
+val query_headers :
+  'a t -> ImportDataTypes.Table.t -> ImportCSV.DataType.t array T.result
+(** Get all the headers from the database (used or not) *)
diff --git a/lib/sql/dune b/lib/sql/dune
new file mode 100644
index 0000000..9f9f205
--- /dev/null
+++ b/lib/sql/dune
@@ -0,0 +1,15 @@
+(library
+  (name importSQL)
+  (libraries
+    re
+    sqlite3
+    calendar
+    importAnalyser
+    importCSV
+    importConf
+    importContainers
+    importDataTypes
+    importErrors
+    importExpression
+  )
+)
diff --git a/lib/sql/hashs.ml b/lib/sql/hashs.ml
new file mode 100644
index 0000000..af1f092
--- /dev/null
+++ b/lib/sql/hashs.ml
@@ -0,0 +1,79 @@
+(**
+    This module stores the hash of the indexes ([extern_key]) for each table, in
+    order to update the file if the configuration changed.
+
+    The hashes are stored in a table named [hashes] and are evaluated just
+    before inserting the values.
+*) + +open StdLabels +module Table = ImportDataTypes.Table + +let ( let* ) = Result.bind + +let create_table : 'a T.t -> unit T.result = + fun db -> + Sqlite3.exec db + "CREATE TABLE IF NOT EXISTS 'hashes' ('table' TEXT, 'hash' INTEGER, \ + PRIMARY KEY ('table'))" + |> T.to_result + +let evaluate : ImportAnalyser.Dependency.t -> int = + fun table -> + (* Extract all the references to this table *) + let keys = + List.map (ImportAnalyser.Dependency.keys table) + ~f:(fun ImportAnalyser.Dependency.{ name; columns; expression } -> + ignore columns; + (name, expression)) + in + Hashtbl.hash keys + +let insert : 'a T.t -> ImportAnalyser.Dependency.t -> unit T.result = + fun db table -> + let source = ImportAnalyser.Dependency.table table in + let table_name = Table.name source in + + let hash = evaluate table in + + let query = + String.concat ~sep:"" + [ + "INSERT INTO 'hashes' ('table', 'hash') VALUES ('"; + table_name; + "', :hash) ON CONFLICT(hashes.'table') DO UPDATE SET 'hash' = :hash"; + ] + in + let* statement = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + + let* _ = T.begin_transaction db in + let sql_data = Sqlite3.Data.INT (Int64.of_int hash) in + + let* _ = Sqlite3.bind_name statement ":hash" sql_data |> T.to_result in + let* _ = T.to_result (Sqlite3.step statement) in + T.commit db + +let query : 'a T.t -> ImportDataTypes.Table.t -> int option T.result = + fun db table -> + let table_name = Table.name table in + let query = + String.concat ~sep:"" + [ "SELECT hash FROM 'hashes' WHERE hashes.'table' = '"; table_name; "'" ] + in + + let* stmt = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + let state, res = + Sqlite3.fold stmt ~init:None ~f:(fun _ d -> + Some (T.to_datatype (Array.get d 0))) + in + + let* _ = T.to_result state in + match res with + | Some (ImportCSV.DataType.Integer i) -> Ok (Some i) + | _ -> Ok None diff --git a/lib/sql/header.ml b/lib/sql/header.ml new file mode 100644 index 0000000..3cac5fb --- /dev/null +++ b/lib/sql/header.ml @@ -0,0 +1,74 @@ +open StdLabels +module Table = ImportDataTypes.Table + +let ( let* ) = Result.bind + +let create_table : 'a T.t -> unit T.result = + fun db -> + Sqlite3.exec db + "CREATE TABLE IF NOT EXISTS 'header' ('table' TEXT, 'column' INTEGER, \ + 'label', PRIMARY KEY ('table', 'column'))" + |> T.to_result + +let insert_header : + 'a T.t -> + ImportDataTypes.Table.t -> + (int * ImportCSV.DataType.t) array -> + unit T.result = + fun db table values -> + let table_name = Table.name table in + + let query = + String.concat ~sep:"" + [ + "INSERT INTO 'header' ('table', 'column', 'label') VALUES ('"; + table_name; + "', :column, :label) ON CONFLICT(header.'table', header.'column') DO \ + UPDATE SET 'label' = :label"; + ] + in + + let statement = Sqlite3.prepare db query in + + let* _ = T.begin_transaction db in + let* _ = + Array.fold_left values ~init:(Ok ()) ~f:(fun acc (column, value) -> + let* _ = acc in + let sql_data = T.of_datatype value in + let* _ = Sqlite3.bind_name statement ":label" sql_data |> T.to_result in + + let* _ = + Sqlite3.bind_name statement ":column" + (Sqlite3.Data.INT (Int64.of_int column)) + |> T.to_result + in + let* _ = T.to_result (Sqlite3.step statement) in + T.reset statement) + in + T.commit db + +let query_headers : + 'a T.t -> ImportDataTypes.Table.t -> ImportCSV.DataType.t array T.result = + fun db table -> + let table_name = Table.name table in + let query = + String.concat ~sep:"" + [ + "SELECT label FROM 'header' WHERE header.'table' = '"; + table_name; + "' ORDER 
BY column DESC"; + ] + in + + let* stmt = + try Ok (Sqlite3.prepare db query) with + | e -> Error e + in + let state, res = + Sqlite3.fold stmt ~init:[] ~f:(fun acc d -> + let value = T.to_datatype (Array.get d 0) in + value :: acc) + in + + let* _ = T.to_result state in + Ok (Array.of_list res) diff --git a/lib/sql/join.ml b/lib/sql/join.ml new file mode 100644 index 0000000..3f82b92 --- /dev/null +++ b/lib/sql/join.ml @@ -0,0 +1,30 @@ +module D = Sqlite3.Data + +let f : Sqlite3.Data.t array -> Sqlite3.Data.t = + fun arguments -> + if Array.length arguments < 2 then Sqlite3.Data.NULL + else + let sep = Array.get arguments 0 in + + (* Shift all the elements into an list*) + let contents = + Array.to_seqi arguments + |> Seq.filter_map (fun (i, value) -> + if i = 0 then None + else + match value with + | D.INT i -> Some (Int64.to_string i) + | D.FLOAT f -> Some (Float.to_string f) + | D.NONE -> None + | D.NULL -> None + | D.TEXT s | D.BLOB s -> + if String.length s = 0 then None else Some (String.trim s)) + |> List.of_seq + in + + D.TEXT (String.concat (D.to_string_coerce sep) contents) + +let register : Sqlite3.db -> unit = + fun db -> + Sqlite3.create_funN db "join" f; + Sqlite3.create_funN db "concat" f diff --git a/lib/sql/match.ml b/lib/sql/match.ml new file mode 100644 index 0000000..82fc1da --- /dev/null +++ b/lib/sql/match.ml @@ -0,0 +1,22 @@ +let f : Sqlite3.Data.t -> Sqlite3.Data.t -> Sqlite3.Data.t = + let memo = Hashtbl.create 16 in + fun str data_1 -> + match (str, data_1) with + | Sqlite3.Data.TEXT s, Sqlite3.Data.TEXT content -> ( + let regex = + match Hashtbl.find_opt memo s with + | None -> + let regex = Re.Posix.compile_pat s in + Hashtbl.add memo s regex; + regex + | Some v -> v + in + + match Re.exec_opt regex content with + | None -> Sqlite3.Data.NULL + | Some g -> + let matched = Re.Group.get g 1 in + Sqlite3.Data.TEXT matched) + | _, _ -> data_1 + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun2 db "match" f diff --git a/lib/sql/math.ml b/lib/sql/math.ml new file mode 100644 index 0000000..576d9f6 --- /dev/null +++ b/lib/sql/math.ml @@ -0,0 +1,20 @@ +(** Math functions *) + +let int : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + (* If the data is already an int, do not change it *) + | Sqlite3.Data.INT _ -> data + | Sqlite3.Data.FLOAT content -> Sqlite3.Data.INT (Int64.of_float content) + | Sqlite3.Data.BLOB content | Sqlite3.Data.TEXT content -> begin + match Int64.of_string_opt content with + | Some i -> Sqlite3.Data.INT i + | None -> begin + match Float.of_string_opt content with + | Some f -> Sqlite3.Data.INT (Int64.of_float f) + | None -> Sqlite3.Data.NULL + end + end + | _ -> Sqlite3.Data.NULL + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "int" int diff --git a/lib/sql/t.ml b/lib/sql/t.ml new file mode 100644 index 0000000..202c535 --- /dev/null +++ b/lib/sql/t.ml @@ -0,0 +1,52 @@ +exception SqlError of Sqlite3.Rc.t + +type 'a t = Sqlite3.db +type 'a result = ('a, exn) Result.t + +let to_result : Sqlite3.Rc.t -> unit result = function + | Sqlite3.Rc.OK | Sqlite3.Rc.DONE -> Ok () + | res -> Error (SqlError res) + +let begin_transaction : Sqlite3.db -> unit result = + fun db -> + let query = "BEGIN" in + Sqlite3.exec db query |> to_result + +let commit : Sqlite3.db -> unit result = + fun db -> + let query = "COMMIT" in + Sqlite3.exec db query |> to_result + +let rollback : Sqlite3.db -> unit result = + fun db -> + let query = "ROLLBACK" in + Sqlite3.exec db query |> to_result + +let savepoint : 
Sqlite3.db -> string -> unit result = + fun db name -> + let query = "SAVEPOINT " ^ name in + Sqlite3.exec db query |> to_result + +let release : Sqlite3.db -> string -> unit result = + fun db name -> + let query = "RELEASE SAVEPOINT " ^ name in + Sqlite3.exec db query |> to_result + +let finalize : Sqlite3.stmt -> unit result = + fun statement -> to_result (Sqlite3.finalize statement) + +let reset : Sqlite3.stmt -> unit result = + fun statement -> to_result (Sqlite3.reset statement) + +let of_datatype = function + | ImportCSV.DataType.Float f -> Sqlite3.Data.FLOAT f + | ImportCSV.DataType.Integer i -> Sqlite3.Data.INT (Int64.of_int i) + | ImportCSV.DataType.Null -> Sqlite3.Data.NULL + | ImportCSV.DataType.Error _ -> Sqlite3.Data.NULL + | ImportCSV.DataType.Content s -> Sqlite3.Data.TEXT s + +let to_datatype : Sqlite3.Data.t -> ImportCSV.DataType.t = function + | Sqlite3.Data.NONE | Sqlite3.Data.NULL -> ImportCSV.DataType.Null + | Sqlite3.Data.INT i -> ImportCSV.DataType.Integer (Int64.to_int i) + | Sqlite3.Data.FLOAT f -> ImportCSV.DataType.Float f + | Sqlite3.Data.TEXT t | Sqlite3.Data.BLOB t -> ImportCSV.DataType.Content t diff --git a/lib/sql/trim.ml b/lib/sql/trim.ml new file mode 100644 index 0000000..4e4bcf4 --- /dev/null +++ b/lib/sql/trim.ml @@ -0,0 +1,9 @@ +(* Override the trim function with another which also remove the retchar *) + +let f : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + | Sqlite3.Data.TEXT content -> Sqlite3.Data.TEXT (String.trim content) + | _ -> data + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "trim" f diff --git a/lib/sql/year.ml b/lib/sql/year.ml new file mode 100644 index 0000000..1e9c599 --- /dev/null +++ b/lib/sql/year.ml @@ -0,0 +1,19 @@ +(** Parse a text value into a date *) + +open CalendarLib + +let first_day = CalendarLib.Date.make 1899 12 30 + +let f : Sqlite3.Data.t -> Sqlite3.Data.t = + fun data -> + match data with + | Sqlite3.Data.INT content -> + let nb = Int64.to_int content in + let date = Date.add first_day (Date.Period.day nb) in + let year = CalendarLib.Date.year date in + + Sqlite3.Data.INT (Int64.of_int year) + | _ -> data + + +let register : Sqlite3.db -> unit = fun db -> Sqlite3.create_fun1 db "year" f diff --git a/lib/tools/dune b/lib/tools/dune new file mode 100644 index 0000000..29ee744 --- /dev/null +++ b/lib/tools/dune @@ -0,0 +1,10 @@ +(library + (name tools)) + +(rule + (target git_hash.ml) + (deps git_head.sh (universe)) + (action + (with-stdout-to + %{target} + (bash "./git_head.sh")))) diff --git a/lib/tools/git_head.sh b/lib/tools/git_head.sh new file mode 100755 index 0000000..bfa738a --- /dev/null +++ b/lib/tools/git_head.sh @@ -0,0 +1,11 @@ +#! /bin/sh +# Include the git hash in an OCaml file. + +git diff-index --quiet HEAD -- +if [ $? -ne 0 ]; then + revision=": untracked" +else + revision=$(git rev-parse --short HEAD) +fi +compile_date=$(date +%Y/%m/%d) +echo "let revision = \"${revision} - compiled on ${compile_date}\"" -- cgit v1.2.3
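The custom SQL functions registered in lib/sql (join/concat, int, trim, date, year, match) are ordinary Sqlite3 user functions, so they can be exercised on their own, outside the import pipeline. The sketch below is illustrative only and is not part of the patch: it assumes the wrapped module paths ImportSQL.Join, ImportSQL.Math and ImportSQL.Trim exposed by the importSQL library defined above, together with the sqlite3-ocaml bindings already listed in its dune file.

(* Illustrative sketch, not from the commit: exercise the custom SQL
   functions on an in-memory database. The ImportSQL.* module paths are
   assumed to be the wrapped names of the importSQL library. *)
let () =
  let db = Sqlite3.db_open ":memory:" in
  (* join/concat: concatenate the non-empty arguments with a separator *)
  ImportSQL.Join.register db;
  (* int: coerce a text or float value to an integer *)
  ImportSQL.Math.register db;
  (* trim: based on String.trim, so it also removes carriage returns *)
  ImportSQL.Trim.register db;
  let rc =
    Sqlite3.exec db
      ~cb:(fun row _headers ->
        Array.iter
          (fun value -> print_endline (Option.value ~default:"<NULL>" value))
          row)
      "SELECT join('-', 'a', '', 'b'), int('12.7'), trim(' x ')"
  in
  (* Expected output: a-b, 12 and x on separate lines, then the return code. *)
  print_endline (Sqlite3.Rc.to_string rc)

Db.with_db installs the same functions on the handle it passes to its callback, so the queries built by the analyser can rely on them directly.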