diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md
index a75dc8599149c..0765749817d19 100644
--- a/datahub-web-react/README.md
+++ b/datahub-web-react/README.md
@@ -1,4 +1,4 @@
-# DataHub React App (Incubating)
+# DataHub React App
 
 ## About
 This module contains a React version of the DataHub UI, which is currently under incubation. Notice that this
diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts
index d571b65ea3d96..4150a4ddd72e0 100644
--- a/docs-website/generateDocsDir.ts
+++ b/docs-website/generateDocsDir.ts
@@ -55,6 +55,7 @@ function list_markdown_files(): string[] {
     /^metadata-ingestion-examples\//,
     /^docs\/rfc\/templates\/000-template\.md$/,
     /^docs\/docker\/README\.md/, // This one is just a pointer to another file.
+    /^docs\/README\.md/, // This one is just a pointer to the hosted docs site.
   ];
 
   const markdown_files = all_markdown_files.filter((filepath) => {
@@ -75,7 +76,6 @@ function get_id(filepath: string): string {
 
 const hardcoded_slugs = {
   "README.md": "/",
-  "docs/README.md": "docs/overview",
 };
 
 function get_slug(filepath: string): string {
@@ -100,13 +100,19 @@ const hardcoded_titles = {
   "docs/demo.md": "Demo",
 };
 
+const hardcoded_descriptions = {
+  // Only applied if title is also overridden.
+  "README.md":
+    "DataHub is a data discovery application built on an extensible metadata platform that helps you tame the complexity of diverse data ecosystems.",
+};
+
 // FIXME: Eventually, we'd like to fix all of the broken links within these files.
 const allowed_broken_links = [
-  "docs/architecture/metadata-serving.md",
   "docs/developers.md",
   "docs/how/customize-elasticsearch-query-template.md",
   "docs/how/graph-onboarding.md",
   "docs/how/search-onboarding.md",
+  "docs/how/build-metadata-service.md",
 ];
 
 function markdown_guess_title(
@@ -120,6 +126,9 @@ function markdown_guess_title(
   let title: string;
   if (filepath in hardcoded_titles) {
     title = hardcoded_titles[filepath];
+    if (filepath in hardcoded_descriptions) {
+      contents.data.description = hardcoded_descriptions[filepath];
+    }
   } else {
     // Find first h1 header and use it as the title.
     const headers = contents.content.match(/^# (.+)$/gm);
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js
index 7ef80b4a8a1b1..e3bdc0b885e9e 100644
--- a/docs-website/sidebars.js
+++ b/docs-website/sidebars.js
@@ -42,12 +42,12 @@ module.exports = {
       // TODO "docs/how/data-source-onboarding",
     ],
     Architecture: [
-      // "docs/README",
      "docs/architecture/architecture",
      "docs/architecture/metadata-ingestion",
-      "docs/what/gma",
+      //"docs/what/gma",
      "docs/architecture/metadata-serving",
-      "docs/what/gms",
+      //"docs/what/gms",
+      "datahub-web-react/README",
    ],
    // },
    // developerGuideSidebar: {
@@ -69,6 +69,7 @@ module.exports = {
      "docs/what/graph",
      "docs/what/search-index",
      "docs/how/add-new-aspect",
+      "docs/how/build-metadata-service",
      "docs/how/customize-elasticsearch-query-template",
      "docs/how/entity-onboarding",
      "docs/how/graph-onboarding",
diff --git a/docs/README.md b/docs/README.md
index fd9dbe8d0575b..4a386790f9ff4 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,28 +1 @@
-# Introduction
-
-DataHub is LinkedIn's generalized metadata search & discovery tool. To learn more about DataHub, check out our
-[LinkedIn blog post](https://engineering.linkedin.com/blog/2019/data-hub) and [Strata presentation](https://speakerdeck.com/shirshanka/the-evolution-of-metadata-linkedins-journey-strata-nyc-2019).
-You should also visit [DataHub Architecture](architecture/architecture.md) to get a better understanding of how DataHub is implemented and [DataHub Onboarding Guide](how/entity-onboarding.md) to understand how to extend DataHub for your own use case.
-
-In general, Datahub has two types of users in mind. One has metadata, and use tools provided by Datahub to ingest metadata into Datahub; The other is to use Datahub to discover metadatas available within Datahub. Datahub provides intuitive UI, full text search capablitity, and graph relationship presentation to make the metadata discover and understanding much eaiser.
-
-The following sequence diagram highlights the key features Datahub has, and how the two types of users - metadata ingestion engineers and metadata discover users, can take full advantage of the Datahub.
-
-![datahub-sequence-diagram](imgs/datahub-sequence-diagram.png)
-1. It starts with ingesting your metadata into datahub. We provide a [collection of sample Python scripts](https://github.com/linkedin/datahub/tree/master/metadata-ingestion) for you. Those scripts work with the popular relationship databases, find metadata of the data source, and publish metadata in Avro data format to MetadataChangeEvent(MCE) Kafka topic.
-2. A MetadataChangeEvent (MCE) processor consumes Kafka message with given topic, and make necessary transformation, send to Generalized Metadata Service (GMS), and GMS persists the metadata to a relational database of your choice. Currently we support MySQL, PostgreSQL and MariaDB.
-3. GMS also checks the received metadata to find out whether there is a previous version. If so, it will publish the difference to Kafka’s MetadataAuditEvent (MAE) topic.
-4. MAE processor consumes MetadataAuditEvent message from Kafka, and persist to Neo4j & Elastic Search (ES).
-5. The frontend of Datahub talks to the metadata restful API services of GMS. The metadata discovering users can browse, search metadatas, get the details of metadata such as the owner, the lineage and other customer tags.
-
-
-## Documentation
-* [DataHub Developer's Guide](developers.md)
-* [DataHub Architecture](architecture/architecture.md)
-* [DataHub Onboarding Guide](how/entity-onboarding.md)
-* [Docker Images](../docker)
-* [Frontend](../datahub-frontend)
-* [Web App](../datahub-web)
-* [Generalized Metadata Service](../gms)
-* [Metadata Ingestion](../metadata-ingestion)
-* [Metadata Processing Jobs](../metadata-jobs)
-* [The RFC Process](rfc.md)
+DataHub's project documentation is hosted at [datahubproject.io](https://datahubproject.io/docs)
diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md
index 549f4a0470eb1..012825ce230e2 100644
--- a/docs/architecture/architecture.md
+++ b/docs/architecture/architecture.md
@@ -1,11 +1,39 @@
 # DataHub Architecture Overview
-![datahub-architecture](../imgs/datahub-architecture.svg)
-## Generalized Metadata Architecture (GMA)
-Refer to [GMA](../what/gma.md).
+We highly recommend that you read the excellent [metadata architectures blog post] that describes the three generations of metadata architectures, and goes into a
+lot of detail around the motivations and evolution of the DataHub architecture in comparison with other data discovery solutions and catalogs.
-## Metadata Serving
-Refer to [metadata-serving](metadata-serving.md).
+The figure below describes the high-level architecture of DataHub, a third-generation metadata platform.
-## Metadata Ingestion
-Refer to [metadata-ingestion](metadata-ingestion.md).
+![datahub-architecture](../imgs/datahub-architecture.png)
+
+## The Components
+The DataHub deployables are split into three components:
+
+### Ingestion
+This component controls how metadata is integrated with DataHub. Read [datahub-ingestion] to learn more.
+
+### Serving
+This component is responsible for storing and querying metadata within DataHub. Read [datahub-serving] to learn more.
+
+### Frontend
+This is the user-facing application that powers search and discovery over the metadata graph. Read [react-frontend] to learn more.
+
+
+## Architecture Highlights
+There are three main highlights of DataHub's architecture.
+
+### Schema-first approach to Metadata Modeling
+DataHub's metadata model is described using a [serialization agnostic language](https://linkedin.github.io/rest.li/pdl_schema). Both [REST](../../gms) and [GraphQL API-s](../../datahub-web-react/src/graphql) are supported. In addition, DataHub supports an [AVRO-based API](../../metadata-events) over Kafka to communicate metadata changes and subscribe to them. Our [roadmap](../roadmap.md) includes a milestone to support no-code metadata model edits very soon, which will allow for even more ease of use, while retaining all the benefits of a typed API. Read about metadata modeling at [metadata modeling].
+### Stream-based Real-time Metadata Platform
+DataHub's metadata infrastructure is stream-oriented, which allows changes in metadata to be communicated and reflected within the platform within seconds. You can also subscribe to changes happening in DataHub's metadata, allowing you to build real-time metadata-driven systems. For example, you can build an access-control system that observes a previously world-readable dataset gaining a new schema field that contains PII, and locks that dataset down for access control review.
+### Federated Metadata Serving
+DataHub comes with a single [metadata service (gms)](../../gms) as part of the open source repository. However, it also supports federated metadata services which can be owned and operated by different teams –– in fact, that is how LinkedIn runs DataHub internally. The federated services communicate with the central search index and graph using Kafka, to support global search and discovery while still enabling decoupled ownership of metadata. This kind of architecture is well suited to companies that are implementing [data mesh](https://martinfowler.com/articles/data-monolith-to-mesh.html).
+
+
+[metadata modeling]: ../how/metadata-modelling.md
+[PDL]: https://linkedin.github.io/rest.li/pdl_schema
+[metadata architectures blog post]: https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained
+[datahub-serving]: metadata-serving.md
+[datahub-ingestion]: metadata-ingestion.md
+[react-frontend]: ../../datahub-web-react/README.md
diff --git a/docs/architecture/metadata-ingestion.md b/docs/architecture/metadata-ingestion.md
index 3950665d5571e..72c703edd6b2e 100644
--- a/docs/architecture/metadata-ingestion.md
+++ b/docs/architecture/metadata-ingestion.md
@@ -1,70 +1,32 @@
 # Metadata Ingestion Architecture
-## MCE Consumer Job
+DataHub supports an extremely flexible ingestion architecture that accommodates push, pull, asynchronous, and synchronous models.
+The figure below describes all the options possible for connecting your favorite system to DataHub.
+![Ingestion Architecture](../imgs/ingestion-architecture.png)
-Metadata providers communicate changes in metadata by emitting [MCE]s, which are consumed by a Kafka Streams job, [mce-consumer-job]. The [Python ingestion framework](../../metadata-ingestion/README.md) makes it easy to emit these MCEs.
-The MCE consumer job converts the AVRO-based MCE into the equivalent [Pegasus Data Template] and saves it into the database by calling a special GMS ingest API.
+## MCE: The Centerpiece
-## MAE Consumer Job
+The centerpiece of ingestion is the [Metadata Change Event (MCE)], which represents a metadata change that is being communicated by an upstream system.
+MCE-s can be sent over Kafka for highly scalable, asynchronous publishing from source systems. They can also be sent directly to the HTTP endpoint exposed by the DataHub service tier to get synchronous success / failure responses.
-All the emitted [MAE] will be consumed by a Kafka Streams job, [mae-consumer-job], which updates the [graph] and [search index] accordingly.
-The job itself is entity-agnostic and will execute corresponding graph & search index builders, which will be invoked by the job when a specific metadata aspect is changed.
-The builder should instruct the job how to update the graph and search index based on the metadata change.
-The builder can optionally use [Remote DAO] to fetch additional metadata from other sources to help compute the final update.
+## Pull-based Integration
-To ensure that metadata changes are processed in the correct chronological order,
-MAEs are keyed by the entity [URN] — meaning all MAEs for a particular entity will be processed sequentially by a single Kafka streams thread.
+DataHub ships with a Python-based [metadata-ingestion system](../../metadata-ingestion/README.md) that can connect to different sources to pull metadata from them. This metadata is then pushed via Kafka or HTTP to the DataHub storage tier. Metadata ingestion pipelines can be [orchestrated by Airflow](../../metadata-ingestion/examples/airflow) to set up scheduled ingestion easily. If you don't find a source already supported, it is very easy to [write your own](../../metadata-ingestion/README.md#contributing).
-## Search and Graph Index Builders
+## Push-based Integration
-As described in [Metadata Modelling] section, [Entity], [Relationship], and [Search Document] models do not directly encode the logic of how each field should be derived from metadata.
-Instead, this logic should be provided in the form of a graph or search index builder.
+As long as you can emit a [Metadata Change Event (MCE)] to Kafka or make a REST call over HTTP, you can integrate any system with DataHub. For convenience, DataHub also provides simple [Python emitters] for you to integrate into your systems to emit metadata changes (MCE-s) at the point of origin.
-The builders register the metadata [aspect]s of their interest against [MAE Consumer Job](#mae-consumer-job) and will be invoked whenever a MAE involving the corresponding aspect is received.
-If the MAE itself doesn’t contain all the metadata needed, builders can use Remote DAO to fetch from GMS directly.
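+If you would rather produce MCE-s yourself than use the Python emitters, the sketch below shows what a bare-bones push-based producer could look like. It is only illustrative: it uses the plain Kafka Java client, assumes the MCE has already been serialized to Avro bytes (for example with the classes generated from [metadata-events](../../metadata-events)), and assumes the topic name `MetadataChangeEvent_v4`; substitute whatever serialization helper and topic your deployment actually uses.
+
+```java
+import java.util.Properties;
+
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerRecord;
+
+public class McePushSketch {
+
+  public static void main(String[] args) {
+    Properties props = new Properties();
+    props.put("bootstrap.servers", "localhost:9092");
+    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
+    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
+
+    // Hypothetical helper: produce the Avro-serialized bytes of a MetadataChangeEvent.
+    byte[] serializedMce = buildAndSerializeMce();
+
+    try (KafkaProducer<String, byte[]> producer = new KafkaProducer<>(props)) {
+      // Key the record by the entity URN so that changes to the same entity stay ordered.
+      producer.send(new ProducerRecord<>("MetadataChangeEvent_v4",
+          "urn:li:dataset:(urn:li:dataPlatform:hive,SampleTable,PROD)", serializedMce));
+      producer.flush();
+    }
+  }
+
+  private static byte[] buildAndSerializeMce() {
+    // Placeholder: construct an MCE and serialize it with the Avro classes from metadata-events.
+    return new byte[0];
+  }
+}
+```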
+## Internal Components
+
+### Applying MCE-s to DataHub Service Tier (mce-consumer)
-```java
-public abstract class BaseIndexBuilder<DOCUMENT extends RecordTemplate> {
+DataHub comes with a Kafka Streams-based job, [mce-consumer-job], which consumes the MCE-s, converts them into the [equivalent Pegasus format], and sends them to the DataHub Service Tier (datahub-gms) using the `/ingest` endpoint.
-  BaseIndexBuilder(@Nonnull List<Class<? extends RecordTemplate>> snapshotsInterested);
+
-  @Nullable
-  public abstract List<DOCUMENT> getDocumentsToUpdate(@Nonnull RecordTemplate snapshot);
-
-  @Nonnull
-  public abstract Class<DOCUMENT> getDocumentType();
-}
-```
-
-```java
-public interface GraphBuilder<SNAPSHOT extends RecordTemplate> {
-  GraphUpdates build(SNAPSHOT snapshot);
-
-  @Value
-  class GraphUpdates {
-    List<? extends RecordTemplate> entities;
-    List<RelationshipUpdates> relationshipUpdates;
-  }
-
-  @Value
-  class RelationshipUpdates {
-    List<? extends RecordTemplate> relationships;
-    BaseGraphWriterDAO.RemovalOption preUpdateOperation;
-  }
-}
-```
-
-[MCE]: ../what/mxe.md#metadata-change-event-mce
+[Metadata Change Event (MCE)]: ../what/mxe.md#metadata-change-event-mce
+[Metadata Audit Event (MAE)]: ../what/mxe.md#metadata-audit-event-mae
 [MAE]: ../what/mxe.md#metadata-audit-event-mae
-[Pegasus Data Template]: https://linkedin.github.io/rest.li/how_data_is_represented_in_memory#the-data-template-layer
-[graph]: ../what/graph.md
-[search index]: ../what/search-index.md
+[equivalent Pegasus format]: https://linkedin.github.io/rest.li/how_data_is_represented_in_memory#the-data-template-layer
 [mce-consumer-job]: ../../metadata-jobs/mce-consumer-job
-[mae-consumer-job]: ../../metadata-jobs/mae-consumer-job
-[Remote DAO]: ../architecture/metadata-serving.md#remote-dao
-[URN]: ../what/urn.md
-[Metadata Modelling]: ../how/metadata-modelling.md
-[Entity]: ../what/entity.md
-[Relationship]: ../what/relationship.md
-[Search Document]: ../what/search-document.md
-[Aspect]: ../what/aspect.md
+[Python emitters]: ../../metadata-ingestion/README.md#using-as-a-library
+
diff --git a/docs/architecture/metadata-serving.md b/docs/architecture/metadata-serving.md
index 26e85cada0cf0..411d094ca0c05 100644
--- a/docs/architecture/metadata-serving.md
+++ b/docs/architecture/metadata-serving.md
@@ -1,118 +1,42 @@
-# Metadata Serving Architecture
+# DataHub Serving Architecture
-This section describes how metadata is served in GMA. In particular, it demonstrates how GMA can efficiently service different types of queries, including key-value, complex queries, and full text search.
-Below shows a high-level system diagram for the metadata serving architecture.
+The figure below shows the high-level system diagram for DataHub's Serving Tier.
-![metadata-serving](../imgs/metadata-serving.png)
+
+![datahub-serving](../imgs/datahub-serving.png)
-There are four types of Data Access Object ([DAO]) that standardize the way metadata is accessed.
-This section describes each type of DAO, its purpose, and the interface.
+The primary service is called [gms](../../gms) and exposes a REST API for performing CRUD operations on metadata. The metadata service also exposes search and graph query API-s to support secondary-index style queries, full-text search queries, as well as relationship queries like lineage. In addition, the [datahub-frontend](../../datahub-frontend) and [datahub-gms-graphql](../../datahub-gms-graphql-service) services expose a GraphQL API on top of the metadata graph.
-These DAOs rely heavily on [Java Generics](https://docs.oracle.com/javase/tutorial/extra/generics/index.html) so that the core logics can remain type-neutral.
-However, as there’s no inheritance in [Pegasus], the generics often fallback to extending [RecordTemplate] instead of the desired types (i.e. [entity], [relationship], metadata [aspect] etc). Additional runtime type checking has been added to the DAOs to avoid binding to unexpected types. We also cache the type checking result to minimize runtime overhead.
+## DataHub Serving Tier Components
-## Key-value DAO (Local DAO)
+### Metadata Storage
-[GMS] use [Local DAO] to store and retrieve metadata [aspect]s from the local document store.
-Below shows the base class and its simple key-value interface.
-As the DAO is a generic class, it needs to be bound to specific type during instantiation.
-Each entity type will need to instantiate its own version of DAO.
+The DataHub Metadata Service (gms) persists metadata in a document store (which could be an RDBMS like MySQL or Postgres, or a key-value store like Couchbase).
-```java
-public abstract class BaseLocalDAO<METADATA extends RecordTemplate> {
+### Metadata Commit Log Stream (MAE)
-  public abstract <URN extends Urn> void
-      add(Class<METADATA> type, URN urn, METADATA value);
+The DataHub Service Tier also emits a commit event, the [Metadata Audit Event (MAE)], when a metadata change has been successfully committed to persistent storage. This event is sent over Kafka.
-  public abstract <URN extends Urn>
-      Optional<METADATA> get(Class<METADATA> type, URN urn, int version);
+The MAE stream is a public API and can be subscribed to by external systems, providing an extremely powerful way to react in real time to changes happening in metadata. For example, you could build an access control enforcer that reacts to changes in metadata (e.g. a previously world-readable dataset now has a PII field) to immediately lock down the dataset in question.
+Note that not all MCE-s will result in an MAE, because the DataHub serving tier will ignore any duplicate changes to metadata.
-  public abstract <URN extends Urn>
-      ListResult<Integer> listVersions(Class<METADATA> type, URN urn, int start,
-      int pageSize);
+### Metadata Index Applier (mae-consumer-job)
-  public abstract <URN extends Urn> ListResult<URN> listUrns(
-      Class<METADATA> type, int start, int pageSize);
+[MAE]-s are consumed by another Kafka Streams job, [mae-consumer-job], which applies the changes to the [graph] and [search index] accordingly.
+The job is entity-agnostic and will execute corresponding graph & search index builders, which will be invoked by the job when a specific metadata aspect is changed.
+The builder should instruct the job how to update the graph and search index based on the metadata change.
+The builder can optionally use [Remote DAO] to fetch additional metadata from other sources to help compute the final update.
-  public abstract <URN extends Urn>
-      ListResult<METADATA> list(Class<METADATA> type, URN urn, int start, int pageSize);
-}
-```
+To ensure that metadata changes are processed in the correct chronological order, MAEs are keyed by the entity [URN] — meaning all MAEs for a particular entity will be processed sequentially by a single Kafka streams thread.
-Another important function of [Local DAO] is to automatically emit [MAE]s whenever the metadata is updated.
-This is doable because MAE effectively use the same [Pegasus] models so [RecordTemplate] can be easily converted into the corresponding [GenericRecord].
+### Metadata Query Serving
-## Search DAO
+Primary-key based reads (e.g. getting schema metadata for a dataset based on the `dataset-urn`) on metadata are routed to the document store. Secondary index based reads on metadata are routed to the search index (or alternatively can use the strongly consistent secondary index support described [here]()).
+Full-text and advanced search queries are routed to the search index. Complex graph queries such as lineage are routed to the graph index.
-Search DAO is also a generic class that can be bound to a specific type of search document.
-The DAO provides 3 APIs:
-* A `search` API that takes the search input, a [Filter], a [SortCriterion], some pagination parameters, and returns a [SearchResult].
-* An `autoComplete` API which allows typeahead-style autocomplete based on the current input and a [Filter], and returns [AutocompleteResult].
-* A `filter` API which allows for filtering only without a search input. It takes a a [Filter] and a [SortCriterion] as input and returns [SearchResult].
+### Further Reading
-```java
-public abstract class BaseSearchDAO<DOCUMENT extends RecordTemplate> {
+
+Read the [metadata service developer guide](../how/build-metadata-service.md) to understand how to customize the DataHub metadata service tier.
-
-  public abstract SearchResult<DOCUMENT> search(String input, Filter filter,
-      SortCriterion sortCriterion, int from, int size);
-
-  public abstract AutoCompleteResult autoComplete(String input, String field,
-      Filter filter, int limit);
-
-  public abstract SearchResult<DOCUMENT> filter(Filter filter, SortCriterion sortCriterion,
-      int from, int size);
-}
-```
-
-## Query DAO
-
-Query DAO allows clients, e.g. [GMS](../what/gms.md), [MAE Consumer Job](metadata-ingestion.md#mae-consumer-job) etc, to perform both graph & non-graph queries against the metadata graph.
-For instance, a GMS can use the Query DAO to find out “all the dataset owned by the users who is part of the group `foo` and report to `bar`,” which naturally translates to a graph query.
-Alternatively, a client may wish to retrieve “all the datasets that stored under /jobs/metrics”, which doesn’t involve any graph traversal.
-
-Below is the base class for Query DAOs, which contains the `findEntities` and `findRelationships` methods.
-Both methods also have two versions, one involves graph traversal, and the other doesn’t.
-You can use `findMixedTypesEntities` and `findMixedTypesRelationships` for queries that return a mixture of different types of entities or relationships.
-As these methods return a list of [RecordTemplate], callers will need to manually cast them back to the specific entity type using [isInstance()](https://docs.oracle.com/javase/8/docs/api/java/lang/Class.html#isInstance-java.lang.Object-) or reflection.
-
-Note that the generics (ENTITY, RELATIONSHIP) are purposely left untyped, as these types are native to the underlying graph DB and will most likely differ from one implementation to another.
-
-```java
-public abstract class BaseQueryDAO {
-
-  public abstract <ENTITY extends RecordTemplate> List<ENTITY> findEntities(
-      Class<ENTITY> type, Filter filter, int offset, int count);
-
-  public abstract <ENTITY extends RecordTemplate> List<ENTITY> findEntities(
-      Class<ENTITY> type, Statement function);
-
-  public abstract List<RecordTemplate> findMixedTypesEntities(Statement function);
-
-  public abstract <ENTITY extends RecordTemplate, RELATIONSHIP extends RecordTemplate> List<RELATIONSHIP>
-      findRelationships(Class<ENTITY> entityType, Class<RELATIONSHIP> relationshipType, Filter filter, int offset, int count);
-
-  public abstract <RELATIONSHIP extends RecordTemplate> List<RELATIONSHIP>
-      findRelationships(Class<RELATIONSHIP> type, Statement function);
-
-  public abstract List<RecordTemplate> findMixedTypesRelationships(
-      Statement function);
-}
-```
-
-## Remote DAO
-
-[Remote DAO] is nothing but a specialized readonly implementation of [Local DAO].
-Rather than retrieving metadata from a local storage, Remote DAO will fetch the metadata from another [GMS].
-The mapping between [entity] type and GMS is implemented as a hard-coded map.
-
-To prevent circular dependency ([rest.li] service depends on remote DAO, which in turn depends on rest.li client generated by each rest.li service),
-Remote DAO will need to construct raw rest.li requests directly, instead of using each entity’s rest.li request builder.
-
-
-[AutocompleteResult]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/AutoCompleteResult.pdl
-[Filter]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/Filter.pdl
-[SortCriterion]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/SortCriterion.pdl
-[SearchResult]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/SearchResult.java
 [RecordTemplate]: https://github.com/linkedin/rest.li/blob/master/data/src/main/java/com/linkedin/data/template/RecordTemplate.java
 [GenericRecord]: https://github.com/apache/avro/blob/master/lang/java/avro/src/main/java/org/apache/avro/generic/GenericRecord.java
 [DAO]: https://en.wikipedia.org/wiki/Data_access_object
@@ -121,7 +45,25 @@ Remote DAO will need to construct raw rest.li requests directly, instead of usin
 [entity]: ../what/entity.md
 [aspect]: ../what/aspect.md
 [GMS]: ../what/gms.md
-[Local DAO]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/BaseLocalDAO.java
-[Remote DAO]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/BaseRemoteDAO.java
 [MAE]: ../what/mxe.md#metadata-audit-event-mae
 [rest.li]: https://rest.li
+
+
+[Metadata Change Event (MCE)]: ../what/mxe.md#metadata-change-event-mce
+[Metadata Audit Event (MAE)]: ../what/mxe.md#metadata-audit-event-mae
+[MAE]: ../what/mxe.md#metadata-audit-event-mae
+[equivalent Pegasus format]: https://linkedin.github.io/rest.li/how_data_is_represented_in_memory#the-data-template-layer
+[graph]: ../what/graph.md
+[search index]: ../what/search-index.md
+[mce-consumer-job]: ../../metadata-jobs/mce-consumer-job
+[mae-consumer-job]: ../../metadata-jobs/mae-consumer-job
+[Remote DAO]: ../architecture/metadata-serving.md#remote-dao
+[URN]: ../what/urn.md
+[Metadata Modelling]: ../how/metadata-modelling.md
+[Entity]: ../what/entity.md
+[Relationship]: ../what/relationship.md
+[Search Document]: ../what/search-document.md
+[metadata aspect]: ../what/aspect.md
+[Python emitters]: https://datahubproject.io/docs/metadata-ingestion/#using-as-a-library
+
+
diff --git a/docs/how/build-metadata-service.md b/docs/how/build-metadata-service.md
new file mode 100644
index 0000000000000..313dda121e15a
--- /dev/null
+++ b/docs/how/build-metadata-service.md
@@ -0,0 +1,190 @@
+# Metadata Service Developer Guide
+
+This guide assumes that you are already familiar with the architecture of DataHub's Metadata Serving Layer, as described [here](../architecture/metadata-serving.md).
+
+Read on to understand how to build and extend the DataHub service tier for your specific needs.
+
+
+## Using DAOs to store and query Metadata
+
+The DataHub metadata service uses the excellent `datahub-gma` library to store and query metadata in a standard way.
+There are four types of Data Access Objects ([DAO]) that standardize the way metadata is accessed.
+This section describes each type of DAO, its purpose, and the interface.
+
+These DAOs rely heavily on [Java Generics](https://docs.oracle.com/javase/tutorial/extra/generics/index.html) so that the core logic can remain type-neutral.
+However, as there’s no inheritance in [Pegasus], the generics often fall back to extending [RecordTemplate] instead of the desired types (i.e. [entity], [relationship], metadata [aspect] etc).
+Additional runtime type checking has been added to the DAOs to avoid binding to unexpected types. We also cache the type checking result to minimize runtime overhead.
+
+### Key-value DAO (Local DAO)
+
+[GMS] uses [Local DAO] to store and retrieve metadata [aspect]s from the local document store.
+Below shows the base class and its simple key-value interface.
+As the DAO is a generic class, it needs to be bound to a specific type during instantiation.
+Each entity type will need to instantiate its own version of the DAO.
+
+```java
+public abstract class BaseLocalDAO<METADATA extends RecordTemplate> {
+
+  public abstract <URN extends Urn> void
+      add(Class<METADATA> type, URN urn, METADATA value);
+
+  public abstract <URN extends Urn>
+      Optional<METADATA> get(Class<METADATA> type, URN urn, int version);
+
+  public abstract <URN extends Urn>
+      ListResult<Integer> listVersions(Class<METADATA> type, URN urn, int start,
+      int pageSize);
+
+  public abstract <URN extends Urn> ListResult<URN> listUrns(
+      Class<METADATA> type, int start, int pageSize);
+
+  public abstract <URN extends Urn>
+      ListResult<METADATA> list(Class<METADATA> type, URN urn, int start, int pageSize);
+}
+```
+
+Another important function of [Local DAO] is to automatically emit [MAE]s whenever the metadata is updated.
+This is doable because MAEs effectively use the same [Pegasus] models, so a [RecordTemplate] can be easily converted into the corresponding [GenericRecord].
+
+### Search DAO
+
+Search DAO is also a generic class that can be bound to a specific type of search document.
+The DAO provides 3 APIs:
+* A `search` API that takes the search input, a [Filter], a [SortCriterion], some pagination parameters, and returns a [SearchResult].
+* An `autoComplete` API which allows typeahead-style autocomplete based on the current input and a [Filter], and returns an [AutocompleteResult].
+* A `filter` API which allows for filtering only, without a search input. It takes a [Filter] and a [SortCriterion] as input and returns a [SearchResult].
+
+```java
+public abstract class BaseSearchDAO<DOCUMENT extends RecordTemplate> {
+
+  public abstract SearchResult<DOCUMENT> search(String input, Filter filter,
+      SortCriterion sortCriterion, int from, int size);
+
+  public abstract AutoCompleteResult autoComplete(String input, String field,
+      Filter filter, int limit);
+
+  public abstract SearchResult<DOCUMENT> filter(Filter filter, SortCriterion sortCriterion,
+      int from, int size);
+}
+```
+
+### Query DAO
+
+Query DAO allows clients, e.g. [GMS](../what/gms.md), the [MAE Consumer Job](../architecture/metadata-serving.md#metadata-index-applier-mae-consumer-job), etc., to perform both graph & non-graph queries against the metadata graph.
+For instance, a GMS can use the Query DAO to find out “all the datasets owned by users who are part of the group `foo` and report to `bar`,” which naturally translates to a graph query.
+Alternatively, a client may wish to retrieve “all the datasets stored under /jobs/metrics”, which doesn’t involve any graph traversal.
+
+Below is the base class for Query DAOs, which contains the `findEntities` and `findRelationships` methods.
+Both methods also have two versions: one involves graph traversal, and the other doesn’t.
+You can use `findMixedTypesEntities` and `findMixedTypesRelationships` for queries that return a mixture of different types of entities or relationships.
+As these methods return a list of [RecordTemplate], callers will need to manually cast them back to the specific entity type using [isInstance()](https://docs.oracle.com/javase/8/docs/api/java/lang/Class.html#isInstance-java.lang.Object-) or reflection.
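+The short sketch below illustrates that narrowing step. It is only a sketch: `DatasetEntity` is a hypothetical entity record template, and `queryDAO` / `statement` stand in for whatever Query DAO instance and graph statement your code already has.
+
+```java
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.linkedin.data.template.RecordTemplate;
+
+public class QueryDaoUsageSketch {
+
+  // Narrow the mixed-type results returned by findMixedTypesEntities back to one entity type.
+  public static <T extends RecordTemplate> List<T> narrowTo(List<RecordTemplate> mixed, Class<T> type) {
+    return mixed.stream()
+        .filter(type::isInstance)   // keep only instances of the requested entity type
+        .map(type::cast)            // safe cast, guarded by the filter above
+        .collect(Collectors.toList());
+  }
+}
+```
+
+With a helper like this, a caller could write `narrowTo(queryDAO.findMixedTypesEntities(statement), DatasetEntity.class)` and work with a strongly typed list again.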
+
+Note that the generics (ENTITY, RELATIONSHIP) are purposely left untyped, as these types are native to the underlying graph DB and will most likely differ from one implementation to another.
+
+```java
+public abstract class BaseQueryDAO {
+
+  public abstract <ENTITY extends RecordTemplate> List<ENTITY> findEntities(
+      Class<ENTITY> type, Filter filter, int offset, int count);
+
+  public abstract <ENTITY extends RecordTemplate> List<ENTITY> findEntities(
+      Class<ENTITY> type, Statement function);
+
+  public abstract List<RecordTemplate> findMixedTypesEntities(Statement function);
+
+  public abstract <ENTITY extends RecordTemplate, RELATIONSHIP extends RecordTemplate> List<RELATIONSHIP>
+      findRelationships(Class<ENTITY> entityType, Class<RELATIONSHIP> relationshipType, Filter filter, int offset, int count);
+
+  public abstract <RELATIONSHIP extends RecordTemplate> List<RELATIONSHIP>
+      findRelationships(Class<RELATIONSHIP> type, Statement function);
+
+  public abstract List<RecordTemplate> findMixedTypesRelationships(
+      Statement function);
+}
+```
+
+### Remote DAO
+
+[Remote DAO] is nothing but a specialized readonly implementation of [Local DAO].
+Rather than retrieving metadata from local storage, Remote DAO will fetch the metadata from another [GMS].
+The mapping between [entity] type and GMS is implemented as a hard-coded map.
+
+To prevent a circular dependency (the [rest.li] service depends on the remote DAO, which in turn depends on the rest.li client generated by each rest.li service),
+Remote DAO will need to construct raw rest.li requests directly, instead of using each entity’s rest.li request builder.
+
+## Customizing Search and Graph Index Updates
+
+In addition to storing and querying metadata, a common requirement is to customize and extend the fields that are being stored in the search or the graph index.
+
+As described in the [Metadata Modelling] section, [Entity], [Relationship], and [Search Document] models do not directly encode the logic of how each field should be derived from metadata.
+Instead, this logic needs to be provided in the form of a Java class: a graph or search index builder.
+
+The builders register the [metadata aspect]s of their interest with the [MAE Consumer Job](../architecture/metadata-serving.md#metadata-index-applier-mae-consumer-job) and will be invoked whenever a MAE involving the corresponding aspect is received.
+If the MAE itself doesn’t contain all the metadata needed, builders can use Remote DAO to fetch from GMS directly.
+
+```java
+public abstract class BaseIndexBuilder<DOCUMENT extends RecordTemplate> {
+
+  BaseIndexBuilder(@Nonnull List<Class<? extends RecordTemplate>> snapshotsInterested);
+
+  @Nullable
+  public abstract List<DOCUMENT> getDocumentsToUpdate(@Nonnull RecordTemplate snapshot);
+
+  @Nonnull
+  public abstract Class<DOCUMENT> getDocumentType();
+}
+```
+
+```java
+public interface GraphBuilder<SNAPSHOT extends RecordTemplate> {
+  GraphUpdates build(SNAPSHOT snapshot);
+
+  @Value
+  class GraphUpdates {
+    List<? extends RecordTemplate> entities;
+    List<RelationshipUpdates> relationshipUpdates;
+  }
+
+  @Value
+  class RelationshipUpdates {
+    List<? extends RecordTemplate> relationships;
+    BaseGraphWriterDAO.RemovalOption preUpdateOperation;
+  }
+}
+```
+
+
+
+[AutocompleteResult]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/AutoCompleteResult.pdl
+[Filter]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/Filter.pdl
+[SortCriterion]: ../../metadata-dao/src/main/pegasus/com/linkedin/metadata/query/SortCriterion.pdl
+[SearchResult]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/SearchResult.java
+[RecordTemplate]: https://github.com/linkedin/rest.li/blob/master/data/src/main/java/com/linkedin/data/template/RecordTemplate.java
+[GenericRecord]: https://github.com/apache/avro/blob/master/lang/java/avro/src/main/java/org/apache/avro/generic/GenericRecord.java
+[DAO]: https://en.wikipedia.org/wiki/Data_access_object
+[Pegasus]: https://linkedin.github.io/rest.li/DATA-Data-Schema-and-Templates
+[relationship]: ../what/relationship.md
+[entity]: ../what/entity.md
+[aspect]: ../what/aspect.md
+[GMS]: ../what/gms.md
+[Local DAO]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/BaseLocalDAO.java
+[Remote DAO]: ../../metadata-dao/src/main/java/com/linkedin/metadata/dao/BaseRemoteDAO.java
+[MAE]: ../what/mxe.md#metadata-audit-event-mae
+[rest.li]: https://rest.li
+
+
+[Metadata Change Event (MCE)]: ../what/mxe.md#metadata-change-event-mce
+[Metadata Audit Event (MAE)]: ../what/mxe.md#metadata-audit-event-mae
+[equivalent Pegasus format]: https://linkedin.github.io/rest.li/how_data_is_represented_in_memory#the-data-template-layer
+[graph]: ../what/graph.md
+[search index]: ../what/search-index.md
+[mce-consumer-job]: ../../metadata-jobs/mce-consumer-job
+[mae-consumer-job]: ../../metadata-jobs/mae-consumer-job
+[URN]: ../what/urn.md
+[Metadata Modelling]: ../how/metadata-modelling.md
+[Entity]: ../what/entity.md
+[Relationship]: ../what/relationship.md
+[Search Document]: ../what/search-document.md
+[metadata aspect]: ../what/aspect.md
+[Python emitters]: https://datahubproject.io/docs/metadata-ingestion/#using-as-a-library
diff --git a/docs/imgs/datahub-architecture.png b/docs/imgs/datahub-architecture.png
new file mode 100644
index 0000000000000..236f939f74198
Binary files /dev/null and b/docs/imgs/datahub-architecture.png differ
diff --git a/docs/imgs/datahub-serving.png b/docs/imgs/datahub-serving.png
new file mode 100644
index 0000000000000..67a2f8eb3f085
Binary files /dev/null and b/docs/imgs/datahub-serving.png differ
diff --git a/docs/imgs/ingestion-architecture.png b/docs/imgs/ingestion-architecture.png
new file mode 100644
index 0000000000000..fc7bc74acacfa
Binary files /dev/null and b/docs/imgs/ingestion-architecture.png differ