From 6b377719c10d5ab3343fd5221f99a4a21008e25a Mon Sep 17 00:00:00 2001 From: Sébastien Dailly Date: Thu, 14 Mar 2024 08:26:58 +0100 Subject: Initial commit --- lib/analysers/dependency.ml | 256 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 lib/analysers/dependency.ml (limited to 'lib/analysers/dependency.ml') diff --git a/lib/analysers/dependency.ml b/lib/analysers/dependency.ml new file mode 100644 index 0000000..e81cc49 --- /dev/null +++ b/lib/analysers/dependency.ml @@ -0,0 +1,256 @@ +open StdLabels +module IntSet = ImportContainers.IntSet +module Syntax = ImportConf.Syntax +module Table = ImportDataTypes.Table +module Path = ImportDataTypes.Path +module Expression = ImportExpression.T + +(* + Internally, the dependency chain is represented as a graph. + + Each file to process (csv or xlsx) is marked as a node, The link between + each node cannot be represented in the graph because each index can imply + multiple tables together (for exemple [join(source.A, other.B)] ). So the + graph is only used in order to get the import process order, and each + information is keeped in a separate table. + +*) + +type deps = (ImportContainers.Source.t * ImportContainers.Source.t list) list + +type key = { + name : string; + expression : Path.column Expression.t; + columns : ImportContainers.IntSet.t Lazy.t; +} + +type t = { + table : Table.t; + columns : IntSet.t; + keys : key list; +} + +let table t = t.table +let columns t = t.columns +let keys t = t.keys + +type build_map = t ImportContainers.Externals.t + +(* The expression can be qualified with full path (when we are in the column + definition) or only with a column (in a binding with an externanl table). + + This type is here to extract the correct information, according to the type + we are dealing with : + + - [to_mapping] allow to pick the right Set in which we need to add the + column pointed by the path (this can be [keys] or [columns] + - [of_path] is need to extract the qualified source from any kind of path. +*) +type 'a expression_extractor = { + to_mapping : t -> Path.column -> t; + of_path : 'a -> string option * Path.column; +} + +(** [add_path_in_map f parent path ] Extract the column from element [path] and + process the column in the function [f] + + The [path] is abstract, but the function [f.of_path] can extract the needed + elements in order to add it in the mapping. + + The function may raise [Unknown_source] if the the path describe an unknown + table. *) +let add_path_in_map : + f:'a expression_extractor -> conf:Syntax.t -> 'a -> build_map -> build_map = + fun ~f ~conf path map -> + let table_source, column = f.of_path path in + let table = + try ImportConf.get_table_for_name conf table_source with + | Not_found -> raise (ImportErrors.Unknown_source (Option.get table_source)) + in + + ImportContainers.Externals.update map + ~key:(ImportContainers.KeyName.from_table table) ~f:(fun v -> + let mapping = + match v with + | None -> raise (ImportErrors.Unknown_source table.name) + | Some mapping -> mapping + in + + Some (f.to_mapping mapping column)) + +let add_expression_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t -> + build_map -> + build_map = + fun ~f ~conf expr map -> + Expression.fold_values expr ~init:map ~f:(fun map p -> + add_path_in_map ~f ~conf p map) + +let add_columns_in_map : + f:'a expression_extractor -> + conf:Syntax.t -> + 'a Expression.t list -> + build_map -> + build_map = + fun ~f ~conf columns map -> + let columns = + List.fold_left columns ~init:map ~f:(fun map expression -> + let new_map = add_expression_in_map ~f ~conf expression map in + new_map) + in + columns + +(* [add_dependancies ~conf source map path] + add the dependancy from the table [source] to another one after analysing the + expression and extracting the path contained inside. + + This function is called for each path declared inside the expression. *) +let add_dependancies : conf:Syntax.t -> Syntax.extern -> deps -> Path.t -> deps + = + fun ~conf extern graph path -> + let source_table = ImportConf.get_table_for_name conf path.Path.alias in + + let source = ImportContainers.Source.from_table source_table in + let target = ImportContainers.Source.from_table extern.target in + + match ImportContainers.Source.equal target source with + | true -> graph + | _ -> (target, [ source ]) :: graph + +let add_external_in_map : + conf:Syntax.t -> Syntax.extern -> build_map * deps -> build_map * deps = + fun ~conf extern (map, graph) -> + let dest = ImportContainers.KeyName.from_table extern.target in + (* Pre-check that every source is already declared in the configuration. *) + let _ = + Expression.fold_values extern.intern_key ~init:() ~f:(fun () path -> + try + let _ = ImportConf.get_table_for_name conf path.Path.alias in + () + with + | Not_found -> ( + match path.alias with + | Some table -> raise (ImportErrors.Unknown_source table) + | None -> + (* This is very unlikely. This error would be raised if we have + no source for this import *) + let root = conf.source in + raise (ImportErrors.Unknown_source root.name))) + in + + (* Create the new key with all the expression and all the columns inside it *) + let new_key = + { + name = extern.target.Table.name; + expression = extern.extern_key; + columns = + lazy + (Expression.fold_values extern.extern_key + ~f:(fun acc k -> ImportContainers.IntSet.add k acc) + ~init:ImportContainers.IntSet.empty); + } + in + let build_map = + ImportContainers.Externals.update map ~key:dest ~f:(function + | None -> + (* Create the entry for the external if it does not exists *) + Some + { + table = extern.target; + columns = IntSet.empty; + keys = [ new_key ]; + } + | Some mapping -> + (* Or update the existing one with the key we just created *) + Some { mapping with keys = new_key :: mapping.keys }) + in + let graph = + Expression.fold_values extern.intern_key ~init:graph + ~f:(add_dependancies ~conf extern) + in + let build_map = + add_expression_in_map extern.intern_key build_map ~conf + ~f: + { + of_path = + (fun Path.{ alias; column } -> + let table = ImportConf.get_table_for_name conf alias in + (Some table.name, column)); + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + } + in + (build_map, graph) + +let mapper = + { + to_mapping = + (fun mapping column -> + { mapping with columns = IntSet.add column mapping.columns }); + of_path = (fun ({ alias; column } : Path.t) -> (alias, column)); + } + +let get_mapping : Syntax.t -> build_map * deps = + fun conf -> + let root = ImportContainers.Source.from_table (ImportConf.root_table conf) + and root' = + ImportContainers.KeyName.from_table (ImportConf.root_table conf) + in + let graph = [ (root, []) ] in + + (* For each external declared in the configuration file, add the columns to + query *) + let init = + ( ImportContainers.Externals.singleton root' + { + table = ImportConf.root_table conf; + columns = IntSet.empty; + keys = []; + }, + graph ) + in + let map, graph = + List.fold_left conf.externals ~init ~f:(fun map extern -> + add_external_in_map ~conf extern map) + in + + (* Now we don’t bother anymore with the graph and it’s dependency, we just + collect all the columns in the differents expressions *) + let map = + map + |> add_columns_in_map ~conf ~f:mapper conf.columns + |> add_columns_in_map ~conf ~f:mapper conf.sort + |> add_columns_in_map ~conf ~f:mapper conf.filters + |> add_columns_in_map ~conf ~f:mapper conf.uniq + in + (map, graph) + +let get_process_order : Syntax.t -> t list = + fun map -> + let map, graph = get_mapping map in + + match Tsort.sort graph with + | Tsort.ErrorCycle l -> + let name_of_key k = + ImportContainers.(Externals.find (Source.name k) map).table.name + in + raise (ImportErrors.Cycle (List.map ~f:name_of_key l)) + | Sorted elements -> + (* It’s OK, we know there is no cycles now, we can extract the files to + load from this list. + *) + List.filter_map elements ~f:(fun v -> + ImportContainers.Externals.find_opt + (ImportContainers.Source.name v) + map) + (* This list can still have duplicated values, and we have to remove them + still keeping the order. + *) + |> List.fold_left ~init:[] ~f:(fun acc element -> + (* Prevent the same file to beeing loaded twice *) + match List.mem element ~set:acc with + | true -> acc + | false -> element :: acc) -- cgit v1.2.3