diff options
-rw-r--r-- | examples/externals_filters.toml | 22 | ||||
-rw-r--r-- | examples/importer_groupe.toml | 15 | ||||
-rw-r--r-- | lib/analysers/chunk.ml | 87 | ||||
-rw-r--r-- | lib/analysers/chunk.mli | 11 | ||||
-rw-r--r-- | lib/analysers/dependency.ml | 66 | ||||
-rw-r--r-- | lib/analysers/dependency.mli | 5 | ||||
-rw-r--r-- | lib/analysers/filters.ml | 27 | ||||
-rw-r--r-- | lib/analysers/query.ml | 83 | ||||
-rw-r--r-- | lib/configuration/read_conf.ml | 6 | ||||
-rwxr-xr-x | lib/file_handler/dune | 1 | ||||
-rw-r--r-- | lib/file_handler/state.ml | 4 | ||||
-rw-r--r-- | lib/sql/db.ml | 155 | ||||
-rw-r--r-- | lib/sql/db.mli | 6 | ||||
-rw-r--r-- | lib/sql/hashs.ml | 14 | ||||
-rw-r--r-- | lib/syntax/importerSyntax.ml | 25 | ||||
-rw-r--r-- | lib/syntax/importerSyntax.mli | 2 | ||||
-rw-r--r-- | readme.rst | 18 | ||||
-rw-r--r-- | tests/analyser_dependency.ml | 8 | ||||
-rw-r--r-- | tests/analyser_filters.ml | 20 | ||||
-rw-r--r-- | tests/analyser_query_test.ml | 95 | ||||
-rw-r--r-- | tests/confLoader.ml | 4 | ||||
-rw-r--r-- | tests/configuration_toml.ml | 37 |
22 files changed, 487 insertions, 224 deletions
diff --git a/examples/externals_filters.toml b/examples/externals_filters.toml new file mode 100644 index 0000000..78e6603 --- /dev/null +++ b/examples/externals_filters.toml @@ -0,0 +1,22 @@ +dataset = "dataset.toml" + +[source] + name = "source" + +# This show how the filter in external works. +# Only the lines with Column C = 'France' will be +# matched. +[externals.source-target] + intern_key = ":source.A" + extern_key = ":A" + filters = [ + ":C = 'France'" + ] + +[sheet] + columns = [ + ":A", + ":C", + ":source-target.A", + ":source-target.C", + ] diff --git a/examples/importer_groupe.toml b/examples/importer_groupe.toml index eb2f7e6..ea2c7de 100644 --- a/examples/importer_groupe.toml +++ b/examples/importer_groupe.toml @@ -1,15 +1,12 @@ -# Cet exemple permet de montrer la combinaison de fonction de groupe avec des -# filtres. +# This file show how the groups can be used in filters. +# +# We search for the greatest value of units sold in France. In order to get the +# expected result, we need to filter first the country in France before +# looking for the greatest value. -# On recherche ici le plus grand nombre d’unitées vendues en France. -# Pour que le résultat soit conforme à l’attendu, il faut que l’application -# commence par filtrer les lignes qui concernent la France, avant de chercher -# la ligne contenant le plus grand nombre d’unités. - -version = 1 +dataset = "dataset.toml" [source] - file = "financial.xlsx" name = "source" [sheet] diff --git a/lib/analysers/chunk.ml b/lib/analysers/chunk.ml index cefa6d8..b09f311 100644 --- a/lib/analysers/chunk.ml +++ b/lib/analysers/chunk.ml @@ -43,9 +43,24 @@ let add_parameters : t -> ImportDataTypes.Value.t Seq.t -> unit = fun t p -> Queue.add_seq t.parameters p module Table = ImportDataTypes.Table -module Q = ImportExpression.Query open StdLabels +let add_expression : + repr:(Format.formatter -> 'a -> unit) -> + t -> + 'a ImportExpression.T.t -> + unit = + fun ~repr group expression -> + let formatter = Format.formatter_of_buffer group.b in + Format.pp_print_char formatter '('; + let queue = + ImportExpression.Query.query_of_expression ImportExpression.Query.BindParam + formatter repr expression + in + Format.pp_print_char formatter ')'; + Format.pp_print_flush formatter (); + add_parameters group (Queue.to_seq queue) + (** Extract the informations from the dependancies. We get two informations here : @@ -57,20 +72,34 @@ let join_external : fun ~conf ~join_buffer external_ -> let extern_table = Table.name external_.target in - let formatter = Format.formatter_of_buffer join_buffer.b in - Format.fprintf formatter "\nLEFT JOIN '%s' AS '%s' ON %t = %s" extern_table - external_.target.name - (Printers.prepare_key ~f:(fun f -> - let q = - Q.query_of_expression Q.BindParam f (Printers.path ~conf) - external_.intern_key - in - - add_parameters join_buffer (Queue.to_seq q))) - (Table.print_column external_.ImporterSyntax.Extern.target - ("key_" ^ external_.ImporterSyntax.Extern.target.name)); - - Format.pp_print_flush formatter () + add_string join_buffer "\nLEFT JOIN '"; + add_string join_buffer extern_table; + add_string join_buffer "' AS '"; + add_string join_buffer external_.target.name; + add_string join_buffer "' ON "; + add_string join_buffer + (Format.asprintf "%t = %s" + (Printers.prepare_key ~f:(fun f -> + let q = + ImportExpression.Query.query_of_expression + ImportExpression.Query.BindParam f (Printers.path ~conf) + external_.intern_key + in + + add_parameters join_buffer (Queue.to_seq q))) + (Table.print_column external_.ImporterSyntax.Extern.target + ("key_" ^ external_.ImporterSyntax.Extern.target.name))); + + (* Add the filters given for this external in the query *) + let table = external_.ImporterSyntax.Extern.target + and filters = external_.ImporterSyntax.Extern.filters in + List.iter filters ~f:(fun f -> + add_string join_buffer " AND "; + add_expression + ~repr:(fun formatter column -> + Format.fprintf formatter "%s" + (Table.print_column table ("col_" ^ string_of_int column))) + join_buffer f) (** Create the from part of the query, adding all the required externals (even when not required) @@ -90,16 +119,26 @@ let create_from_statement_of_chunck : (* Add the externals in the query *) List.iter externals ~f:(join_external ~conf ~join_buffer:c) -let add_expression : - conf:ImporterSyntax.t -> +(** Add a list of expressions into the group *) +let add_expressions : + repr:(Format.formatter -> 'a -> unit) -> + sep:string -> t -> - ImportDataTypes.Path.t ImportExpression.T.t -> + 'a ImportExpression.T.t list -> unit = - fun ~conf group expression -> + fun ~repr ~sep group exppressions -> let formatter = Format.formatter_of_buffer group.b in - let queue = - ImportExpression.Query.query_of_expression ImportExpression.Query.BindParam - formatter (Printers.path ~conf) expression + let () = + Format.pp_print_list + ~pp_sep:(fun f () -> Format.pp_print_string f sep) + (fun formatter column -> + Format.pp_print_char formatter '('; + let seq = + ImportExpression.Query.query_of_expression + ImportExpression.Query.BindParam formatter repr column + in + Format.pp_print_char formatter ')'; + Queue.transfer seq group.parameters) + formatter exppressions in - Format.pp_print_flush formatter (); - add_parameters group (Queue.to_seq queue) + Format.pp_print_flush formatter () diff --git a/lib/analysers/chunk.mli b/lib/analysers/chunk.mli index 13a748a..ad9ca00 100644 --- a/lib/analysers/chunk.mli +++ b/lib/analysers/chunk.mli @@ -27,8 +27,13 @@ val create_from_statement_of_chunck : the select clause. *) val add_expression : - conf:ImporterSyntax.t -> + repr:(Format.formatter -> 'a -> unit) -> t -> 'a ImportExpression.T.t -> unit +(** Add an expression into an existing chunck *) + +val add_expressions : + repr:(Format.formatter -> 'a -> unit) -> + sep:string -> t -> - ImportDataTypes.Path.t ImportExpression.T.t -> + 'a ImportExpression.T.t list -> unit -(** Add an expression into an existing chunck *) +(** Add a list of expressions into an existing chunk *) diff --git a/lib/analysers/dependency.ml b/lib/analysers/dependency.ml index 38bc23c..8c969fe 100644 --- a/lib/analysers/dependency.ml +++ b/lib/analysers/dependency.ml @@ -1,7 +1,5 @@ open StdLabels module IntSet = ImportContainers.IntSet -module Table = ImportDataTypes.Table -module Path = ImportDataTypes.Path module Expression = ImportExpression.T (* @@ -19,13 +17,14 @@ type deps = (ImportContainers.Source.t * ImportContainers.Source.t list) list type key = { name : string; - expression : Path.column Expression.t; + expression : ImportDataTypes.Path.column Expression.t; columns : ImportContainers.IntSet.t Lazy.t; + filters : ImportDataTypes.Path.column ImportExpression.T.t list; } [@@deriving show, eq] type t = { - table : Table.t; + table : ImportDataTypes.Table.t; columns : IntSet.t; keys : key list; } @@ -47,8 +46,8 @@ type build_map = t ImportContainers.Externals.t - [of_path] is need to extract the qualified source from any kind of path. *) type 'a expression_extractor = { - to_mapping : t -> Path.column -> t; - of_path : 'a -> string option * Path.column; + to_mapping : t -> ImportDataTypes.Path.column -> t; + of_path : 'a -> string option * ImportDataTypes.Path.column; } (** [add_path_in_map f parent path ] Extract the column from element [path] and @@ -112,9 +111,15 @@ let add_columns_in_map : This function is called for each path declared inside the expression. *) let add_dependancies : - conf:ImporterSyntax.t -> ImporterSyntax.Extern.t -> deps -> Path.t -> deps = + conf:ImporterSyntax.t -> + ImporterSyntax.Extern.t -> + deps -> + ImportDataTypes.Path.t -> + deps = fun ~conf extern graph path -> - let source_table = ImporterSyntax.get_table_for_name conf path.Path.alias in + let source_table = + ImporterSyntax.get_table_for_name conf path.ImportDataTypes.Path.alias + in let source = ImportContainers.Source.from_table source_table in let target = ImportContainers.Source.from_table extern.target in @@ -134,7 +139,10 @@ let add_external_in_map : let _ = Expression.fold_values extern.intern_key ~init:() ~f:(fun () path -> try - let _ = ImporterSyntax.get_table_for_name conf path.Path.alias in + let _ = + ImporterSyntax.get_table_for_name conf + path.ImportDataTypes.Path.alias + in () with | Not_found -> ( @@ -147,16 +155,19 @@ let add_external_in_map : raise (ImportErrors.Unknown_source root.name))) in + let columns () = + let f = fun acc k -> ImportContainers.IntSet.add k acc in + Expression.fold_values extern.extern_key ~f + ~init:ImportContainers.IntSet.empty + in + (* Create the new key with all the expression and all the columns inside it *) let new_key = { - name = extern.target.Table.name; + name = extern.target.ImportDataTypes.Table.name; expression = extern.extern_key; - columns = - lazy - (Expression.fold_values extern.extern_key - ~f:(fun acc k -> ImportContainers.IntSet.add k acc) - ~init:ImportContainers.IntSet.empty); + columns = Lazy.from_fun columns; + filters = extern.filters; } in let build_map = @@ -182,7 +193,7 @@ let add_external_in_map : ~f: { of_path = - (fun Path.{ alias; column } -> + (fun ImportDataTypes.Path.{ alias; column } -> let table = ImporterSyntax.get_table_for_name conf alias in (Some table.name, column)); to_mapping = @@ -197,7 +208,8 @@ let mapper = to_mapping = (fun mapping column -> { mapping with columns = IntSet.add column mapping.columns }); - of_path = (fun ({ alias; column } : Path.t) -> (alias, column)); + of_path = + (fun ({ alias; column } : ImportDataTypes.Path.t) -> (alias, column)); } let get_mapping : ImporterSyntax.t -> build_map * deps = @@ -221,7 +233,24 @@ let get_mapping : ImporterSyntax.t -> build_map * deps = in let map, graph = List.fold_left conf.externals ~init ~f:(fun map extern -> - add_external_in_map ~conf extern map) + let map, graph = add_external_in_map ~conf extern map in + + (* Also add the filters in the externals. The column are not defined as + a full path, with table and column, but only with a column. We need + to transform them to make them present as the same as the others *) + let table = + ImporterSyntax.get_table_for_name conf (Some extern.target.name) + in + let path_filters = + List.map extern.filters ~f:(fun expression -> + Expression.map + ~f:(fun column -> + ImportDataTypes.Path.{ alias = Some table.name; column }) + expression) + in + let map = add_columns_in_map ~conf ~f:mapper path_filters map in + + (map, graph)) in (* Now we don’t bother anymore with the graph and it’s dependency, we just @@ -233,6 +262,7 @@ let get_mapping : ImporterSyntax.t -> build_map * deps = |> add_columns_in_map ~conf ~f:mapper conf.filters |> add_columns_in_map ~conf ~f:mapper conf.uniq in + (map, graph) let get_process_order : ImporterSyntax.t -> t list = diff --git a/lib/analysers/dependency.mli b/lib/analysers/dependency.mli index 522436c..1eb55c5 100644 --- a/lib/analysers/dependency.mli +++ b/lib/analysers/dependency.mli @@ -29,7 +29,10 @@ type key = { (** The list of columns used in the key. All the columns are referenced in the expression. We can have many columns used inside a single key when a function is used (for example for joining multiple columns into a - single key) *) + single key). + + The columns used in the filter are also declared as well. *) + filters : ImportDataTypes.Path.column ImportExpression.T.t list; } [@@deriving show, eq] (** This type describe the join key in a table. The name is the refering table diff --git a/lib/analysers/filters.ml b/lib/analysers/filters.ml index 6b1d843..15e8cda 100644 --- a/lib/analysers/filters.ml +++ b/lib/analysers/filters.ml @@ -4,25 +4,6 @@ module Path = ImportDataTypes.Path module Expression = ImportExpression open StdLabels -(** Add a list of expressions into the group *) -let add_filters : - conf:ImporterSyntax.t -> Chunk.t -> Path.t Expression.T.t list -> unit = - fun ~conf group -> function - | [] -> () - | any -> - let rec f ~conf group = function - | [] -> () - | hd :: [] -> - Chunk.add_expression ~conf group hd; - Chunk.add_string group ")" - | hd :: tl -> - Chunk.add_expression ~conf group hd; - Chunk.add_string group ")\nAND ("; - f ~conf group tl - in - Chunk.add_string group "("; - f ~conf group any - type 'a cte_acc = { n : int; has_previous : bool; @@ -65,7 +46,7 @@ let print : Chunk.add_string query "SELECT "; Chunk.add_string query conf.source.name; Chunk.add_string query ".id, "; - Chunk.add_expression ~conf query expression; + Chunk.add_expression ~repr:(Printers.path ~conf) query expression; Chunk.add_string query " AS group_function"; Chunk.create_from_statement_of_chunck conf query; @@ -83,7 +64,8 @@ let print : | [] -> () | _ -> Chunk.add_string query " WHERE "; - add_filters ~conf query cte.ImporterSyntax.CTE.filters + Chunk.add_expressions ~sep:"\nAND " ~repr:(Printers.path ~conf) + query cte.ImporterSyntax.CTE.filters end; Chunk.add_string query ")\n"; Some acc.n @@ -91,7 +73,8 @@ let print : (* Do not add the filters in the CTE (we don’t have any) but in the main query *) Chunk.add_string predicates "WHERE "; - add_filters ~conf predicates cte.ImporterSyntax.CTE.filters; + Chunk.add_expressions ~sep:"\nAND " ~repr:(Printers.path ~conf) + predicates cte.ImporterSyntax.CTE.filters; acc.cte_index in { diff --git a/lib/analysers/query.ml b/lib/analysers/query.ml index f89f5f0..dac4d89 100644 --- a/lib/analysers/query.ml +++ b/lib/analysers/query.ml @@ -135,41 +135,24 @@ let select : ImporterSyntax.t -> query * Path.t ImportExpression.T.t array = let () = Chunk.create_from_statement_of_chunck conf request_header in Chunk.append ~head:request_header ~tail:filters; - let formatter = Format.formatter_of_buffer b in (match conf.ImporterSyntax.uniq with | [] -> () | uniq -> - Format.fprintf formatter "\nGROUP BY %a" - (Format.pp_print_list - ~pp_sep:(fun f () -> Format.fprintf f ", ") - (fun formatter column -> - let seq = - Q.query_of_expression Q.BindParam formatter (Printers.path ~conf) - column - in - Queue.transfer seq parameters)) + Chunk.add_string request_header "\nGROUP BY "; + Chunk.add_expressions ~repr:(Printers.path ~conf) ~sep:", " request_header uniq); (match conf.ImporterSyntax.sort with | [] -> () | sort -> - Format.fprintf formatter "\nORDER BY %a" - (Format.pp_print_list - ~pp_sep:(fun f () -> Format.fprintf f ", ") - (fun formatter column -> - let seq = - Q.query_of_expression Q.BindParam formatter (Printers.path ~conf) - column - in - Queue.transfer seq parameters)) + Chunk.add_string request_header "\nORDER BY "; + Chunk.add_expressions ~repr:(Printers.path ~conf) ~sep:", " request_header sort); - Format.pp_print_flush formatter (); - ({ q = Buffer.contents b; parameters = Queue.to_seq parameters }, headers) let check_external : ImporterSyntax.t -> ImporterSyntax.Extern.t -> query = fun conf external_ -> let internal_chunk = Chunk.create () in - Chunk.add_expression ~conf internal_chunk + Chunk.add_expression ~repr:(Printers.path ~conf) internal_chunk external_.ImporterSyntax.Extern.intern_key; let external_key_buffer = Buffer.create 16 in @@ -180,6 +163,7 @@ let check_external : ImporterSyntax.t -> ImporterSyntax.Extern.t -> query = let pointed_tables = pointed_tables conf external_.intern_key in let parameters = Queue.create () in + (* We do a copy before the transfert because the Queue is reused later in the query *) Queue.transfer (Queue.copy internal_chunk.parameters) parameters; @@ -195,44 +179,47 @@ let check_external : ImporterSyntax.t -> ImporterSyntax.Extern.t -> query = ImporterSyntax.Extern.t list -> ImporterSyntax.Extern.t list = fun table init -> - let res = - (* Do not add the same external if the value is already present *) - let init = - match List.find_opt init ~f:(fun ext -> table == ext) with - | None -> table :: init - | Some _ -> init - in - - Expression.T.fold_values ~init table.ImporterSyntax.Extern.intern_key - ~f:(fun acc expr -> - match expr.Path.alias with - | None -> acc - | Some _ as path -> ( - let table = ImporterSyntax.get_table_for_name conf path in - (* Look for this table in the externals *) - let external_opt = - List.find_opt conf.ImporterSyntax.externals ~f:(fun t -> - t.ImporterSyntax.Extern.target == table) - in - match external_opt with - | None -> acc - | Some ext -> collect_links ext acc)) + (* Do not add the same external if the value is already present *) + let init = + match + List.find_opt init ~f:(fun ext -> ImporterSyntax.Extern.equal table ext) + with + | None -> table :: init + | Some _ -> init in - res + + Expression.T.fold_values ~init table.ImporterSyntax.Extern.intern_key + ~f:(fun acc expr -> + match expr.Path.alias with + | None -> acc + | Some _ as path -> ( + let table = ImporterSyntax.get_table_for_name conf path in + (* Look for this table in the externals *) + let external_opt = + List.find_opt conf.ImporterSyntax.externals ~f:(fun t -> + t.ImporterSyntax.Extern.target == table) + in + match external_opt with + | None -> acc + | Some ext -> collect_links ext acc)) in let dependencies = collect_links external_ [] in let join_content = Buffer.contents external_key_buffer in let request = Chunk.create () in + Chunk.add_string request "SELECT "; + + (* Check if we can identify the line number in the file. It’s only possible + if we have a single source used as a key *) let () = match pointed_tables with - | [] -> - (* Otherwise, just return -1 *) - Chunk.add_string request "-1" | (table, _name) :: _ -> (* If we have a single source, extract the row number. *) Chunk.add_string request (Table.print_column table "id") + | [] -> + (* Otherwise, just return -1 *) + Chunk.add_string request "-1" in Chunk.add_string request ", "; Chunk.append ~head:request ~tail:(Chunk.copy internal_chunk); diff --git a/lib/configuration/read_conf.ml b/lib/configuration/read_conf.ml index c3c78cc..f78ff5b 100644 --- a/lib/configuration/read_conf.ml +++ b/lib/configuration/read_conf.ml @@ -306,6 +306,10 @@ module Make (S : Decoders.Decode.S) = struct and* tab = S.field_opt_or ~default:1 "tab" S.int and* allow_missing = S.field_opt_or ~default:false "allow_missing" S.bool + and* filters = + S.field_opt_or ~default:[] "filters" + (S.list + (self#parse_expression ~eq:Int.equal ExpressionParser.column)) in S.succeed @@ -315,7 +319,7 @@ module Make (S : Decoders.Decode.S) = struct extern_key; target = { name; file; tab }; allow_missing; - match_rule = None; + filters; } (** Load the configuration for an external file to link *) diff --git a/lib/file_handler/dune b/lib/file_handler/dune index 80b2ff6..6d5def0 100755 --- a/lib/file_handler/dune +++ b/lib/file_handler/dune @@ -9,6 +9,7 @@ core
lwt
lwt.unix
+ ppx_deriving.runtime
helpers
importDataTypes
importContainers
diff --git a/lib/file_handler/state.ml b/lib/file_handler/state.ml index 7cf57da..e073fd3 100644 --- a/lib/file_handler/state.ml +++ b/lib/file_handler/state.ml @@ -151,7 +151,9 @@ let clear : ImporterSyntax.t -> unit ImportSQL.Db.result = fun ~log_error db mapping conf -> - ImportSQL.Db.clear_duplicates db (A.table mapping) (A.keys mapping) + let table = A.table mapping in + let is_root = ImportDataTypes.Table.equal table conf.source in + ImportSQL.Db.clear_duplicates ~is_root db table (A.keys mapping) ~f:(fun values -> let line = match snd @@ Array.get values 0 with diff --git a/lib/sql/db.ml b/lib/sql/db.ml index f2a2653..1d92a2e 100644 --- a/lib/sql/db.ml +++ b/lib/sql/db.ml @@ -286,17 +286,6 @@ let create_view : Sqlite3.db -> ImporterSyntax.t -> unit T.result = Ok () -(* - let query, _ = ImportAnalyser.Query.select output in - - let query = - { query with q = Printf.sprintf "CREATE VIEW result AS %s" query.q } - in - let** statement = execute_query db query in - - Sqlite3.step statement |> T.to_result - *) - let check_foreign : f:((string * ImportDataTypes.Value.t) array -> unit) -> Sqlite3.db -> @@ -316,66 +305,118 @@ let check_foreign : |> T.to_result let clear_duplicates : + is_root:bool -> f:((string * ImportDataTypes.Value.t) array -> unit) -> 'a t -> ImportDataTypes.Table.t -> ImportAnalyser.Dependency.key list -> unit T.result = - fun ~f db table keys -> + fun ~is_root ~f db table keys -> let table_name = Table.name table in - List.fold_left keys ~init:(Ok ()) - ~f:(fun state { ImportAnalyser.Dependency.name; _ } -> - let** _ = state in - - let select_query = - String.concat ~sep:"" - [ - "SELECT '"; - table_name; - "'.id, '"; - table_name; - "'.'key_"; - name; - "', '"; - name; - "' FROM '"; - table_name; - "' INNER JOIN (SELECT id, row_number() OVER(PARTITION BY '"; - table_name; - "'.'key_"; - name; - "' ORDER BY (id)) AS row_num from '"; - table_name; - "') other_table WHERE other_table.row_num <> 1 and '"; - table_name; - "'.id = other_table.id"; - ] - in - let stmt = Sqlite3.prepare db select_query in + let keys_name = + List.map keys ~f:(fun ImportAnalyser.Dependency.{ name; filters; _ } -> + ( name, + filters, + String.concat ~sep:"" [ "'"; table_name; "'.'key_"; name; "'" ] )) + in - ignore - @@ Sqlite3.iter stmt ~f:(fun data -> - let values = - Array.mapi data ~f:(fun i value -> - (Sqlite3.column_name stmt i, T.to_datatype value)) - in - f values); + let** _ = + List.fold_left keys_name ~init:(Ok ()) + ~f:(fun state (name, filters, key_name) -> + let table_external = + ImportDataTypes.Table.{ file = ""; tab = 0; name = table_name } + in - let delete_query = - Printf.sprintf - {|UPDATE '%s' -SET key_%s = NULL + (* If there are filters to apply in the external, prepare the predicates now *) + let join_buffer = ImportAnalyser.Chunk.create () in + let () = + match filters with + | [] -> () + | _ -> + ImportAnalyser.Chunk.add_string join_buffer " WHERE "; + ImportAnalyser.Chunk.add_expressions ~sep:"\nAND " + ~repr:(fun formatter column -> + Format.fprintf formatter "%s" + (Table.print_column table_external + ("col_" ^ string_of_int column))) + join_buffer filters + in + let values = + Queue.to_seq join_buffer.parameters + |> Seq.map (fun v -> T.of_datatype v) + |> List.of_seq + in + + let** _ = state in + + let select_query = + String.concat ~sep:"" + [ + "SELECT '"; + table_name; + "'.id, "; + key_name; + ", '"; + name; + "' FROM '"; + table_name; + "' INNER JOIN (SELECT id, row_number() OVER(PARTITION BY "; + key_name; + " ORDER BY (id)) AS row_num from '"; + table_name; + "'"; + Buffer.contents join_buffer.b; + ") other_table WHERE other_table.row_num <> 1 AND \ + 'other_table'.id = "; + Table.print_column table_external "id"; + ] + in + let stmt = Sqlite3.prepare db select_query in + let* _ = Sqlite3.bind_values stmt values in + + ignore + @@ Sqlite3.iter stmt ~f:(fun data -> + let values = + Array.mapi data ~f:(fun i value -> + (Sqlite3.column_name stmt i, T.to_datatype value)) + in + f values); + + let clear_query = + Printf.sprintf + {|UPDATE '%s' +SET 'key_%s' = NULL FROM ( - SELECT id, row_number() OVER(PARTITION BY key_%s ORDER BY (id)) AS row_num + SELECT id, row_number() OVER(PARTITION BY %s ORDER BY (id)) AS row_num, * from '%s' -) other_table +%s) other_table WHERE other_table.row_num <> 1 and '%s'.id = other_table.id|} - table_name name name table_name table_name - in + table_name name key_name table_name + (Buffer.contents join_buffer.b) + table_name + in - Sqlite3.exec db delete_query |> T.to_result) + let stmt = Sqlite3.prepare db clear_query in + let* _ = Sqlite3.bind_values stmt values in + let* _ = Sqlite3.step stmt in + Result.ok ()) + in + + (* Now remove from the database all the line having ALL the keys to null *) + match (is_root, keys_name) with + | true, _ | _, [] -> Result.ok () + | _ -> + let predicates = + List.map keys_name ~f:(fun (_, _, key) -> key ^ " IS NULL") + |> String.concat ~sep:" AND " + in + let delete_query = + Printf.sprintf {|DELETE FROM '%s' WHERE %s|} table_name predicates + in + let** _ = Sqlite3.exec db delete_query |> T.to_result in + Result.ok () type 'a result = ('a, exn) Result.t diff --git a/lib/sql/db.mli b/lib/sql/db.mli index 213fb27..b73479b 100644 --- a/lib/sql/db.mli +++ b/lib/sql/db.mli @@ -84,12 +84,16 @@ val check_foreign : unit result val clear_duplicates : + is_root:bool -> f:((string * ImportDataTypes.Value.t) array -> unit) -> 'a t -> ImportDataTypes.Table.t -> ImportAnalyser.Dependency.key list -> unit result -(** Remove all duplicated keys in the table by setting them to NULL. *) +(** Remove all duplicated keys in the table by setting them to NULL. + + This function will check each key referenced used in the table, and will + process each key separately.*) val insert_header : 'a t -> diff --git a/lib/sql/hashs.ml b/lib/sql/hashs.ml index eced2b4..4df8ca1 100644 --- a/lib/sql/hashs.ml +++ b/lib/sql/hashs.ml @@ -5,7 +5,6 @@ before inserting the values. *) open StdLabels -module Table = ImportDataTypes.Table let ( let* ) = Result.bind @@ -21,16 +20,21 @@ let evaluate : ImportAnalyser.Dependency.t -> int = (* Extract all the references to this table *) let keys = List.map (ImportAnalyser.Dependency.keys table) - ~f:(fun ImportAnalyser.Dependency.{ name; columns; expression } -> + ~f:(fun + ImportAnalyser.Dependency.{ name; columns; expression; filters } -> + (* It’s better to explicitly ignore the fields we want to exclude than + using the pattern _ in the fuction definition. + + This way, adding a new field in the type will raise a compilation error. *) ignore columns; - (name, expression)) + (name, expression, filters)) in Hashtbl.hash keys let insert : 'a T.t -> ImportAnalyser.Dependency.t -> unit T.result = fun db table -> let source = ImportAnalyser.Dependency.table table in - let table_name = Table.name source in + let table_name = ImportDataTypes.Table.name source in let hash = evaluate table in @@ -56,7 +60,7 @@ let insert : 'a T.t -> ImportAnalyser.Dependency.t -> unit T.result = let query : 'a T.t -> ImportDataTypes.Table.t -> int option T.result = fun db table -> - let table_name = Table.name table in + let table_name = ImportDataTypes.Table.name table in let query = String.concat ~sep:"" [ "SELECT hash FROM 'hashes' WHERE hashes.'table' = '"; table_name; "'" ] diff --git a/lib/syntax/importerSyntax.ml b/lib/syntax/importerSyntax.ml index cfbba81..d91db09 100644 --- a/lib/syntax/importerSyntax.ml +++ b/lib/syntax/importerSyntax.ml @@ -14,13 +14,18 @@ let toml_of_table Table.{ file; tab; name } = Otoml.table values +let repr_expression_list ~repr l = + Otoml.array + (List.map l ~f:(fun v -> + Otoml.string (ImportExpression.Repr.repr ~top:true repr v))) + module Extern = struct type t = { intern_key : Path.t E.t; target : Table.t; extern_key : Path.column E.t; allow_missing : bool; - match_rule : string option; + filters : ImportDataTypes.Path.column ImportExpression.T.t list; } [@@deriving show, eq] (** Describe a relation beteween two tables *) @@ -38,6 +43,10 @@ module Extern = struct extern.extern_key ); ("file", Otoml.string extern.target.file); ("allow_missing", Otoml.boolean extern.allow_missing); + ( "filters", + repr_expression_list + ~repr:(fun s -> ":" ^ Path.column_to_string s) + extern.filters ); ] in @@ -66,19 +75,13 @@ type t = { } let repr t = - let repr_expression_list l = - Otoml.array - (List.map l ~f:(fun v -> - Otoml.string (ImportExpression.Repr.repr ~top:true Path.show v))) - in - let sheet = Otoml.table [ - ("columns", repr_expression_list t.columns); - ("filters", repr_expression_list t.filters); - ("sort", repr_expression_list t.sort); - ("uniq", repr_expression_list t.uniq); + ("columns", repr_expression_list ~repr:Path.show t.columns); + ("filters", repr_expression_list ~repr:Path.show t.filters); + ("sort", repr_expression_list ~repr:Path.show t.sort); + ("uniq", repr_expression_list ~repr:Path.show t.uniq); ] in diff --git a/lib/syntax/importerSyntax.mli b/lib/syntax/importerSyntax.mli index 49b7364..47dabe6 100644 --- a/lib/syntax/importerSyntax.mli +++ b/lib/syntax/importerSyntax.mli @@ -4,7 +4,7 @@ module Extern : sig target : ImportDataTypes.Table.t; extern_key : ImportDataTypes.Path.column ImportExpression.T.t; allow_missing : bool; - match_rule : string option; + filters : ImportDataTypes.Path.column ImportExpression.T.t list; } [@@deriving show, eq] end @@ -205,6 +205,13 @@ allow_missing Cette clef optionnelle indique s’il faut autoriser les valeurs manquantes lors dans une liaison. Déclarer `allow_missing` va également autoriser les doublons sur les valeurs de clef. +filters + Il s’agit d’une liste d’expression devant être validées pour que la ligne + soit prise en compte. + + .. include:: examples/externals_filters.toml + :code: toml + :class: collapse Une fois la dépendance vers un fichier externe déclaré, il est possible d’utiliser les colonnes de ce fichier en les référençant directement. Cela @@ -337,6 +344,10 @@ résultat selon que l’on filtre : colonne B ≠ 0 2. d’abord sur la colonne B ≠ 0, puis la plus grande valeur de la colonne A. +.. include:: examples/importer_groupe.toml + :code: toml + :class: collapse + Exemple complet =============== @@ -402,10 +413,13 @@ partir de plusieurs tables. # La table des immatriculations. # Seules les lignes avec la colonne D = 4 nous intéressent [externals.immat] - intern_key = ":A ^ '_4'" + intern_key = ":A" file = "20220222_SO_IMMATRICULATION.xlsx" - extern_key = ":B ^ '_' ^ :D" + extern_key = ":B" allow_missing = true + filters = [ + ":D = 4" + ] [sheet] columns = [ diff --git a/tests/analyser_dependency.ml b/tests/analyser_dependency.ml index 511b706..00f21d7 100644 --- a/tests/analyser_dependency.ml +++ b/tests/analyser_dependency.ml @@ -40,6 +40,7 @@ let test_keys = name = "other"; expression = Expression.Path 3; columns = lazy (Cont.IntSet.singleton 3); + filters = []; }; ] in @@ -61,6 +62,7 @@ let test_keys_missing = name = "last_file"; expression = Expression.Path 3; columns = lazy (Cont.IntSet.singleton 3); + filters = []; }; ] in @@ -145,14 +147,14 @@ let test_unlinked = target = { file = "other.xlsx"; tab = 1; name = "circular" }; extern_key = Path 3; allow_missing = true; - match_rule = None; + filters = []; }; { intern_key = Path { alias = Some "circular"; column = 1 }; target = { file = "other2.xlsx"; tab = 1; name = "circular2" }; extern_key = Path 3; allow_missing = true; - match_rule = None; + filters = []; }; ]; columns = []; @@ -177,7 +179,7 @@ let conf_with_unlinked = target = { file = "other.xlsx"; tab = 1; name = "other" }; extern_key = Path 3; allow_missing = false; - match_rule = None; + filters = []; }; ]; columns = diff --git a/tests/analyser_filters.ml b/tests/analyser_filters.ml index ef51e0c..9a54bde 100644 --- a/tests/analyser_filters.ml +++ b/tests/analyser_filters.ml @@ -80,9 +80,9 @@ let group_filter () = let expected_links = Chunk.create () in Chunk.add_string expected_links - "WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_1') OVER \ + "WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_1') OVER \ (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN \ - UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1 AS group_function\n\ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1) AS group_function\n\ FROM 'source' AS 'source')\n"; Alcotest.(check @@ pair Test_migration.chunk Test_migration.chunk) @@ -106,9 +106,9 @@ let expression_with_group () = let expected_links = Chunk.create () in Chunk.add_string expected_links - "WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_1') OVER \ + "WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_1') OVER \ (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN \ - UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1 AS group_function\n\ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1) AS group_function\n\ FROM 'source' AS 'source' WHERE (1=COALESCE('source'.'col_1',0)))\n"; Alcotest.(check @@ pair Test_migration.chunk Test_migration.chunk) @@ -134,9 +134,9 @@ let group_with_expression () = let expected_links = Chunk.create () in Chunk.add_string expected_links - "WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_1') OVER \ + "WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_1') OVER \ (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN \ - UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1 AS group_function\n\ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1) AS group_function\n\ FROM 'source' AS 'source')\n"; Alcotest.(check @@ pair Test_migration.chunk Test_migration.chunk) @@ -161,13 +161,13 @@ let group_with_group () = let expected_links = Chunk.create () in Chunk.add_string expected_links - "WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_1') OVER \ + "WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_1') OVER \ (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN \ - UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1 AS group_function\n\ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1) AS group_function\n\ FROM 'source' AS 'source')\n\ - , filter1 AS (SELECT source.id, LAST_VALUE('source'.'col_1') OVER \ + , filter1 AS (SELECT source.id, (LAST_VALUE('source'.'col_1') OVER \ (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN \ - UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1 AS group_function\n\ + UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=1) AS group_function\n\ FROM 'source' AS 'source'\n\ INNER JOIN 'filter0' ON filter0.id = source.id\n\ WHERE filter0.group_function)\n"; diff --git a/tests/analyser_query_test.ml b/tests/analyser_query_test.ml index ed89623..37a748b 100644 --- a/tests/analyser_query_test.ml +++ b/tests/analyser_query_test.ml @@ -46,11 +46,11 @@ let check_externals = let query = Q.check_external conf (List.hd conf.externals) in let expected_query = - "SELECT 'source'.'id', 'source'.'col_1'\n\ + "SELECT 'source'.'id', ('source'.'col_1')\n\ FROM 'source' AS 'source'\n\ LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.'col_1')) = \ 'other'.'key_other' WHERE 'other'.'key_other' IS NULL AND \ - 'source'.'col_1' IS NOT NULL AND 'source'.'col_1' <> ''" + ('source'.'col_1') IS NOT NULL AND ('source'.'col_1') <> ''" in Alcotest.check Alcotest.string "" expected_query query.q @@ -179,6 +179,7 @@ let prepare_insert = name = "key_test"; expression = Concat [ Path 1; Literal "_"; Empty ]; columns = lazy (ImportContainers.IntSet.singleton 1); + filters = []; } in @@ -207,7 +208,7 @@ let filter_group = let contents, _ = ImportAnalyser.Query.select conf in let expected = - {|WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_3') OVER (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS group_function + {|WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_3') OVER (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)) AS group_function FROM 'source' AS 'source' LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.'col_1')) = 'other'.'key_other' LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.'col_1')) = 'last_file'.'key_last_file') @@ -239,7 +240,7 @@ let filter_group2 = let contents, _ = ImportAnalyser.Query.select conf in let expected = - {|WITH filter0 AS (SELECT source.id, LAST_VALUE('source'.'col_3') OVER (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS group_function + {|WITH filter0 AS (SELECT source.id, (LAST_VALUE('source'.'col_3') OVER (PARTITION BY 'source'.'col_1' ORDER BY 'source'.'col_1' RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)) AS group_function FROM 'source' AS 'source' LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.'col_1')) = 'other'.'key_other' LEFT JOIN 'last' AS 'last_file' ON rtrim(upper('other'.'col_1')) = 'last_file'.'key_last_file') @@ -253,6 +254,89 @@ WHERE (COALESCE('source'.'col_3',0)=0) AND filter0.group_function|} Alcotest.check Alcotest.string "" expected contents.q +(** Add an external with a filter. Ensure the predicate is reported in the + query. *) +let external_filter = + "external_filter" >:: fun _ -> + let conf = + Syntax. + { + ConfLoader.conf with + externals = + [ + { + ConfLoader.external_other with + filters = [ Expression_builder.(equal (path 1) integer_one) ]; + }; + ]; + columns = [ Expression_builder.empty ]; + } + in + + let query, _ = ImportAnalyser.Query.select conf in + let expected_query = + {|SELECT '' AS result_0 +FROM 'source' AS 'source' +LEFT JOIN 'other' AS 'other' ON rtrim(upper('source'.'col_1')) = 'other'.'key_other' AND (COALESCE('other'.'col_1',0)=1)|} + in + Alcotest.check Test_migration.trimed_string "" expected_query query.q + +let order_by = + "order_by" >:: fun () -> + let conf = + Syntax. + { + ConfLoader.conf with + externals = []; + columns = + [ + Expression_builder.path + ImportDataTypes.Path.{ alias = None; column = 1 }; + ]; + sort = + [ + Expression_builder.path + ImportDataTypes.Path.{ alias = None; column = 1 }; + Expression_builder.integer_one; + ]; + } + in + let query, _ = ImportAnalyser.Query.select conf in + let expected_query = + {|SELECT 'source'.'col_1' AS result_0 +FROM 'source' AS 'source' +ORDER BY ('source'.'col_1'), (1)|} + in + Alcotest.check Test_migration.trimed_string "" expected_query query.q + +let group_by = + "order_by" >:: fun () -> + let conf = + Syntax. + { + ConfLoader.conf with + externals = []; + columns = + [ + Expression_builder.path + ImportDataTypes.Path.{ alias = None; column = 1 }; + ]; + uniq = + [ + Expression_builder.path + ImportDataTypes.Path.{ alias = None; column = 1 }; + Expression_builder.integer_one; + ]; + } + in + let query, _ = ImportAnalyser.Query.select conf in + let expected_query = + {|SELECT 'source'.'col_1' AS result_0 +FROM 'source' AS 'source' +GROUP BY ('source'.'col_1'), (1)|} + in + Alcotest.check Test_migration.trimed_string "" expected_query query.q + let test_suit = [ create_table; @@ -265,6 +349,9 @@ let test_suit = prepare_insert; filter_group; filter_group2; + external_filter; + order_by; + group_by; ] let tests = "analyser_query_test" >::: test_suit diff --git a/tests/confLoader.ml b/tests/confLoader.ml index e6187c3..b0be690 100644 --- a/tests/confLoader.ml +++ b/tests/confLoader.ml @@ -61,7 +61,7 @@ let external_other = target = external_table_other; extern_key = Path 3; allow_missing = false; - match_rule = None; + filters = []; } let external_table_last = @@ -74,5 +74,5 @@ let external_last = target = external_table_last; extern_key = Path 3; allow_missing = true; - match_rule = None; + filters = []; } diff --git a/tests/configuration_toml.ml b/tests/configuration_toml.ml index 470af4a..620a106 100644 --- a/tests/configuration_toml.ml +++ b/tests/configuration_toml.ml @@ -303,6 +303,40 @@ columns = []|} Alcotest.(check (result Test_migration.syntax string)) "Dataset with alias" expected configuration +let external_filters () = + let configuration = + ConfLoader.load' + {|[source] +name = "" +file = "" +tab = 0 + +[externals.other] + intern_key = ":A" + file = "other.xlsx" + extern_key = ":C" + filters = [":B = 1"] + + +[sheet] +columns = []|} + and expected = + Ok + { + ImporterSyntax.dummy_conf with + externals = + ConfLoader. + [ + { + external_other with + filters = [ Expression_builder.(equal (path 2) integer_one) ]; + }; + ]; + } + in + Alcotest.(check (result Test_migration.syntax string)) + "Filters in external" expected configuration + let test_suit = [ ( "parse_extern" >:: fun _ -> @@ -322,7 +356,7 @@ let test_suit = Path { alias = None; column = 1 }; Path { alias = None; column = 2 }; ] ); - match_rule = None; + filters = []; allow_missing = true; } in @@ -372,6 +406,7 @@ let test_suit = ("Empty dataset", `Quick, empty_dataset); ("Dataset with invalid key", `Quick, dataset_with_invalid_key); ("External dataset", `Quick, external_dataset); + ("External with filter", `Quick, external_filters); ] let tests = "configuration_toml" >::: test_suit |