diff --git a/.htaccess b/.htaccess new file mode 100644 index 00000000..58eeb9f2 --- /dev/null +++ b/.htaccess @@ -0,0 +1,16 @@ + +AddOutputFilterByType DEFLATE text/plain +AddOutputFilterByType DEFLATE text/html +AddOutputFilterByType DEFLATE text/xml +AddOutputFilterByType DEFLATE text/css +AddOutputFilterByType DEFLATE application/xml +AddOutputFilterByType DEFLATE application/xhtml+xml +AddOutputFilterByType DEFLATE application/rss+xml +AddOutputFilterByType DEFLATE application/javascript +AddOutputFilterByType DEFLATE application/x-javascript +AddOutputFilterByType DEFLATE image/jpg +AddOutputFilterByType DEFLATE image/png +AddOutputFilterByType DEFLATE image/gif +AddOutputFilterByType DEFLATE image/jpeg +AddOutputFilterByType DEFLATE image/svg+xml + \ No newline at end of file diff --git a/CNAME b/CNAME new file mode 100644 index 00000000..505ea8a1 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +ldbcouncil.org diff --git a/README.md b/README.md new file mode 100644 index 00000000..eb54e506 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +This repository is built based on the source at (private repository). diff --git a/becoming-a-member/index.html b/becoming-a-member/index.html new file mode 100644 index 00000000..3d97a272 --- /dev/null +++ b/becoming-a-member/index.html @@ -0,0 +1,371 @@ + + + + + Becoming a Member + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Becoming a Member

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

Benefits

+

The benefits of LDBC membership are:

+
    +
  • Access to the internal LDBC information via its wiki and mailing lists, which includes access to discussion documents, graph user scenarios, datasets, draft benchmark specifications and software, results and discussions.
  • +
  • Right to membership of LDBC task forces. Currently, there are task forces on the Semantic Publishing Benchmark, the Social Network Benchmark Interactive, BI and the Graphalytics workloads, as well as the Graph Query Language task force.
  • +
  • Access to research resources at academic partners. This includes the ability to look into research agendas, provide feedback and establish working relationships with students, as well as arrange targeted internships with MSc and PhD students provided by these partners.
  • +
+

Annual membership fees (2023)

+
    +
  • Sponsor company/institution: 8,800 GBP
  • +
  • Commercial company: 2,200 GBP
  • +
  • Non-commercial institution: 1,100 GBP
  • +
  • Individual Voting member (standard rate): 165 GBP
  • +
  • Individual Voting member (reduced rate for students, etc.): 10 GBP
  • +
  • Individual Associate member: no subscription fee
  • +
+

Note that there is a 2,000 GBP auditing fee per audit, payable to the LDBC by non-sponsor company members. Sponsor companies are exempt from this.

+

Forms

+

For the latest information on becoming an LDBC member, see the LDBC Membership – Joining and Renewing 2023 document.

+

Please fill out the form applicable to your employment status and contribution and email it to the info@ldbcouncil.org address.

+ +

Constitutional documents

+

See the constitutional documents page.

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmark-finbench/index.html b/benchmark-finbench/index.html new file mode 100644 index 00000000..588fa8b8 --- /dev/null +++ b/benchmark-finbench/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/finbench/ + + + + + + diff --git a/benchmark-graphalytics/index.html b/benchmark-graphalytics/index.html new file mode 100644 index 00000000..66b5ac71 --- /dev/null +++ b/benchmark-graphalytics/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/graphalytics/ + + + + + + diff --git a/benchmark-snb/index.html b/benchmark-snb/index.html new file mode 100644 index 00000000..722eb105 --- /dev/null +++ b/benchmark-snb/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/snb/ + + + + + + diff --git a/benchmark-spb/index.html b/benchmark-spb/index.html new file mode 100644 index 00000000..b1746b28 --- /dev/null +++ b/benchmark-spb/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/spb/ + + + + + + diff --git a/benchmarks/fair-use-policies/index.html b/benchmarks/fair-use-policies/index.html new file mode 100644 index 00000000..d7267b13 --- /dev/null +++ b/benchmarks/fair-use-policies/index.html @@ -0,0 +1,418 @@ + + + + + Fair Use Policy for LDBC Benchmarks® + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Fair Use Policy for LDBC Benchmarks®

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+ +

The text of this page is based on our Byelaws.

+

LDBC Benchmarks® and LDBC Benchmark® Results

+

LDBC expects all its members to conscientiously observe the provisions of this Fair Use Policy for LDBC Benchmarks. LDBC-approved auditors must bring this Fair Use Policy for LDBC Benchmarks to the attention of any prospective or actual Test Sponsor. The Board of Directors of LDBC is responsible for enforcing this Policy and any alleged violations should be notified to info@ldbcouncil.org.

+
    +
  1. An “LDBC Draft Benchmark®” is a benchmark specification and any associated tooling or datasets, which has been written by an LDBC Task Force or Working Group whose charter includes the goal of achieving adoption of that specification as an LDBC standard, in accordance with Article 33 of the Articles of Association of the Company, “Approval of Standards”.
  2. +
  3. An “LDBC Benchmark®” is an LDBC Draft Benchmark once it has been adopted as an LDBC standard.
  4. +
  5. A result of a performance test can be fairly described as an “LDBC Benchmark Result”, if the test—which may be executed in several runs all of which use the same System Under Test (SUT)—has been successfully audited by an LDBC-approved auditor, and the result is reported as part of an LDBC Benchmark Results set, so it can be interpreted in context.
  6. +
  7. An audit can only be successful if the audited test +
      +
    1. uses a SUT which faithfully implements the mandatory features and chosen optional features of an LDBC Benchmark,
    2. +
    3. completely exercises and generates results for all the mandatory requirements and chosen optional requirements of the LDBC Benchmark, and
    4. +
    5. is conducted and audited in conformance with all the relevant provisions of the LDBC Byelaws, including the statement of Total Cost of Ownership for the SUT and the reporting of price/performance metrics, such that the reported results can legitimately be used to compare the price-weighted performance of two SUTs.
    6. +
    +
  8. +
  9. “LDBC Benchmark Results” is a set of all the results of a successfully audited test. A single LDBC Benchmark Result must be reported as part of such a set.
  10. +
  11. Any description or depiction of a specification that states or implies that it is an LDBC Draft Benchmark or an LDBC Benchmark when that is not the case is an infringement of LDBC’s trademark in the term “LDBC BENCHMARK”, which is registered in several major jurisdictions.
  12. +
  13. The same trademark is infringed by any software which is described or promoted as being an implementation of an LDBC Draft Benchmark or LDBC Benchmark, but which does not faithfully implement the features of or does not support the mandatory requirements of the stated specification.
  14. +
  15. The same trademark is infringed by any report or description of one or more performance test results which are not part of a set of LDBC Benchmark Results, or in any other way states or implies that the results are endorsed by or originate from LDBC.
  16. +
  17. LDBC considers the use of that trademarked term with respect to performance test results solely in accordance with these Byelaws to be essential to the purpose and reputation of the Company and its benchmark standards.
  18. +
+

Reporting of LDBC Benchmark Results

+

Once an auditor has approved a performance test result, including all required supporting documentation, as being successfully audited, then the Members Council and the Task Force responsible for the benchmark will be notified. The Board will have the results added to the LDBC web site as an LDBC Benchmark Results set according to the following procedure:

+
    +
  1. LDBC members will receive notification of the result via email to their designated contacts within five business days of LDBC receiving the notification.
  2. +
  3. Within five business days of this notice, the LDBC administrator will post the result on the LDBC web site under the rubric “LDBC Benchmark Results” unless the result is withdrawn by the Test Sponsor in the meantime.
  4. +
  5. A result may be challenged and subsequently be withdrawn by the LDBC following a review process as described in Article 7.6.
  6. +
  7. A result that is not challenged within 60 days of its publication will be automatically considered valid and may not be challenged after this time, and this fact will be recorded as part of the website posting of the result.
  8. +
+

Fair Use of the trademark LDBC BENCHMARK

+

Any party wishing to avoid infringement of the trademarked term “LDBC BENCHMARK” should follow the following guidelines relating to its fair use.

+

LDBC encourages use, derived use, study, descriptions, critiques of and suggestions for improvement of LDBC Draft Benchmarks and LDBC Benchmarks. Our benchmark specifications are open-source, and we always welcome new contributors and members. These guidelines are only intended to prevent false or confusing claims relating to performance test results that are intended to be used for product comparisons.

+
    +
  1. If your work is derived from an LDBC Draft or standard Benchmark, or is a partial implementation, or if you are using part of one of our standards for a non-benchmarking purpose, then we would expect you to give attribution, in line with our Creative Commons CC-BY 4.0 licence.
  2. +
  3. We would also suggest that you make a statement, somewhere, somehow, that includes one of these phrases “This is not an LDBC Benchmark”, “This is not an implementation of an LDBC Benchmark” or “These are not LDBC Benchmark Results”.
  4. +
  5. We would also suggest that you explain, however briefly, how your work is related to LDBC standards and how it varies from them.
  6. +
+

An example that illustrates these points: you might say something like this in a presentation:

+
+

“We used the LDBC SNB benchmark as a starting point. This isn’t the official LDBC standard: we added four queries because of X, and we don’t attempt to deal with the ACID requirement. The test results aren’t audited, so we want to be clear that this is not an LDBC Benchmark test run, and these numbers are not LDBC Benchmark Results. If you look at this link on the slide I’m showing you can see all the details of how our work is derived from, and varies from, the SNB 2.0 spec.”

+
+

Or you might say:

+
+

“For this example of a GQL graph type we used the LDBC SNB data model. This is nothing to do with the actual LDBC benchmark specification: we just used their data model as a use-case for illustrating what a graph schema might look like. We took this from the SNB 2.0 spec.”

+
+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/finbench/finbench-talk-16th-tuc.pdf b/benchmarks/finbench/finbench-talk-16th-tuc.pdf new file mode 100644 index 00000000..69d3b523 Binary files /dev/null and b/benchmarks/finbench/finbench-talk-16th-tuc.pdf differ diff --git a/benchmarks/finbench/index.html b/benchmarks/finbench/index.html new file mode 100644 index 00000000..b4c6f206 --- /dev/null +++ b/benchmarks/finbench/index.html @@ -0,0 +1,356 @@ + + + + + LDBC Financial Benchmark (LDBC FinBench) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Financial Benchmark (LDBC FinBench)

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as
+anti-fraud and risk control. It is maintained by the LDBC FinBench Task Force.

+

The benchmark has one workload, Transaction Workload, capturing an OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph. Its data sets are available on Google Drive.

+

For a brief overview, see the slides presented in the 16th TUC meeting. The Financial Benchmark’s specification can be found on arXiv.

+

FinBench Audit

+

Audit results

+

There are no audited results yet.

+

Commissioning audits

+

For auditing requests, please reach out at info@ldbcouncil.org. Audits can only be commissioned by LDBC member companies by contracting any of the LDBC-certified auditors. Note that there is a 2,000 GBP auditing fee to be paid to the LDBC by non-sponsor company members. Sponsor companies are exempt from this.

+

Use of audited results

+

Fair use policies

+

The LDBC Financial Benchmark is subject to the LDBC Fair Use Policies.

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/finbench/ldbc-finbench-work-charter.pdf b/benchmarks/finbench/ldbc-finbench-work-charter.pdf new file mode 100644 index 00000000..53bdc5f2 Binary files /dev/null and b/benchmarks/finbench/ldbc-finbench-work-charter.pdf differ diff --git a/benchmarks/graphalytics/datagen-9_4-fb.md5 b/benchmarks/graphalytics/datagen-9_4-fb.md5 new file mode 100644 index 00000000..7d073131 --- /dev/null +++ b/benchmarks/graphalytics/datagen-9_4-fb.md5 @@ -0,0 +1,9 @@ +38e45cc2157d9ac58aa9b71d88eda597 datagen-9_4-fb-BFS +d92295cd9f3266cd60317be5498a6bb3 datagen-9_4-fb-CDLP +a4a2ec19074dfb8035ba23906b42022e datagen-9_4-fb-LCC +efee23b3046a519fca7eaccb2d25c83b datagen-9_4-fb-PR +0aac4961401679a4677e34464d289a98 datagen-9_4-fb-SSSP +3f0ee48788fb67b7dbfb91f5e9281df8 datagen-9_4-fb-WCC +e27ac41236b43105ec2dc637d9c18f13 datagen-9_4-fb.v +ea88ce7eb69fb8bd04d9675c6063db89 datagen-9_4-fb.e +fee5726a16c9be80e461578aa0de31ec datagen-9_4-fb.properties diff --git a/benchmarks/graphalytics/datagen-sf10k-fb.md5 b/benchmarks/graphalytics/datagen-sf10k-fb.md5 new file mode 100644 index 00000000..a73413fa --- /dev/null +++ b/benchmarks/graphalytics/datagen-sf10k-fb.md5 @@ -0,0 +1,9 @@ +0e0ae322ed19be5b9e27ba3747e80dc7 datagen-sf10k-fb-BFS +18a9a1e0841ed95d8102acf7a1c6f07a datagen-sf10k-fb-CDLP +f4a40d8c28df120ddcf99ee1cce63d8a datagen-sf10k-fb-LCC +916fc2a7441b3f91c1b55b9e31343746 datagen-sf10k-fb-PR +fea3e750b4dd4778e92d8ba8f5fd851c datagen-sf10k-fb-SSSP +8492393a263a97ca7699e4ae305cd6d4 datagen-sf10k-fb-WCC +e0f44f491673601c972e0b6cc1956210 datagen-sf10k-fb.v +f51b3b2b0be31450df064d5dc93afa70 datagen-sf10k-fb.e +8e4899461fe0e600f506949847fd9350 datagen-sf10k-fb.properties diff --git a/benchmarks/graphalytics/datagen-sf3k-fb.md5 b/benchmarks/graphalytics/datagen-sf3k-fb.md5 new file mode 100644 index 00000000..b54434a0 --- /dev/null +++ 
b/benchmarks/graphalytics/datagen-sf3k-fb.md5 @@ -0,0 +1,9 @@ +61dcb2aa6d9d7bc0bf2371c05fa301fb datagen-sf3k-fb-BFS +ee421e17f579c4ee56a4cca9464af028 datagen-sf3k-fb-CDLP +b3ef2af6288a0d90d575f4af8f8bb37f datagen-sf3k-fb-LCC +2bbb3f2f19945ecda2b1bcb34e328aa1 datagen-sf3k-fb-PR +a22e3c363f42f3ad676669eef2cc5bf9 datagen-sf3k-fb-SSSP +44fc01388f60820e7853de521453bc8f datagen-sf3k-fb-WCC +201c44c0a285602ed8974d45fa27bfdb datagen-sf3k-fb.v +8cee484579faf5e21e6204af1d6b22c5 datagen-sf3k-fb.e +56beff84261ccfdc86c383c87e43b4c2 datagen-sf3k-fb.properties diff --git a/benchmarks/graphalytics/graph500-27.md5 b/benchmarks/graphalytics/graph500-27.md5 new file mode 100644 index 00000000..aa104a98 --- /dev/null +++ b/benchmarks/graphalytics/graph500-27.md5 @@ -0,0 +1,8 @@ +a74369ca68d96540b7a7ad32c5330e18 graph500-27-BFS +0b669d439a6d8720bc76b5ad5dce316b graph500-27-CDLP +039164e31cbcb906ba2d24a95206de33 graph500-27-LCC +c10cd1fd0073d1d8c6129b44493b1a6e graph500-27-PR +e06a58c914d8777a7af4edfb49418b7e graph500-27-WCC +11e30e0644e0664c36d94f1eb6dc1dfe graph500-27.v +578778aa5e034661e8fd09028ecce363 graph500-27.e +ef4abdd11d081f551ad4932a907b5d01 graph500-27.properties diff --git a/benchmarks/graphalytics/graph500-28.md5 b/benchmarks/graphalytics/graph500-28.md5 new file mode 100644 index 00000000..2305a59d --- /dev/null +++ b/benchmarks/graphalytics/graph500-28.md5 @@ -0,0 +1,8 @@ +45dff47cbec77ce77ae2c80f1e64e97a graph500-28-BFS +0aa6a52bbe45e0b2f97581043e14b8fb graph500-28-CDLP +5212bb51a7c285e994feb297358c74b0 graph500-28-LCC +32837223b988a2eb03b82b8f1bed224b graph500-28-PR +c71e6963a1cff7b4392d3d6e9bc2d2e1 graph500-28-WCC +a25fb88d220fde0bffddb60b75237d6e graph500-28.v +02ee07342d20ec3ea850f576b473bb00 graph500-28.e +9bd2866bf0c26e8dbe52ece1c7da839a graph500-28.properties diff --git a/benchmarks/graphalytics/graph500-29.md5 b/benchmarks/graphalytics/graph500-29.md5 new file mode 100644 index 00000000..761e8dc8 --- /dev/null +++ b/benchmarks/graphalytics/graph500-29.md5 @@ 
-0,0 +1,8 @@ +a79d313775b1afd6e6dad4c1c5f131c1 graph500-29-BFS +94c6b8ac82664973e5f9d4d3dd8292f8 graph500-29-CDLP +5afc9b0bf64d8bd572df54cada249ae4 graph500-29-LCC +e0eb05a8dbea49d53dee18306cd53641 graph500-29-PR +b3f423edac470ced445a8c715ca09942 graph500-29-WCC +b44e047d7aa80b818a4187574a1a5797 graph500-29.v +100cd63d096f228d5b7c9aa1a71a9a85 graph500-29.e +49d4023cdc452de105fef23c88dceae4 graph500-29.properties diff --git a/benchmarks/graphalytics/graph500-30.md5 b/benchmarks/graphalytics/graph500-30.md5 new file mode 100644 index 00000000..dfb5e5cd --- /dev/null +++ b/benchmarks/graphalytics/graph500-30.md5 @@ -0,0 +1,8 @@ +3fd0f5ea5b6f25d82f099f7a0042f9c4 graph500-30-BFS +84b4634fd584babdc3e1494a6417b240 graph500-30-CDLP +89e70b27420916231a830abfd15f8537 graph500-30-LCC +317b3e7da73f501389dcd2c19603a226 graph500-30-PR +c33d8e13d8f84426757816e8ef792a9f graph500-30-WCC +004e13a22aba4c4cf94fc1ea3a460e6f graph500-30.v +3df72be3b189ba9c0c8b6c1f3762eda9 graph500-30.e +bf352710c02f5baea96f63b4e64738ef graph500-30.properties diff --git a/benchmarks/graphalytics/index.html b/benchmarks/graphalytics/index.html new file mode 100644 index 00000000..7ad945c8 --- /dev/null +++ b/benchmarks/graphalytics/index.html @@ -0,0 +1,841 @@ + + + + + LDBC Graphalytics Benchmark (LDBC Graphalytics) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Graphalytics Benchmark (LDBC Graphalytics)

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

The Graphalytics benchmark is an industrial-grade benchmark for graph analysis platforms such as Giraph, Spark GraphX, and GraphBLAS. It consists of six core algorithms, standard data sets, and reference outputs, enabling the objective comparison of graph analysis platforms.

+

The benchmark harness consists of a core component, which is extendable by a driver for each different platform implementation. The benchmark includes the following algorithms:

+
    +
  1. breadth-first search (BFS)
  2. +
  3. PageRank (PR)
  4. +
  5. weakly connected components (WCC)
  6. +
  7. community detection using label propagation (CDLP)
  8. +
  9. local clustering coefficient (LCC)
  10. +
  11. single-source shortest paths (SSSP)
  12. +
+

The choice of these algorithms was carefully motivated, using the LDBC TUC and extensive literature surveys to ensure good coverage of scenarios. The standard data sets include both real and synthetic data sets, which are classified into intuitive “T-shirt” sizes (S, M, L, etc.).

+

Each experiment set in Graphalytics consists of multiple platform runs (a platform executes an algorithm on a data set), and a diverse set of experiments is carried out to evaluate different performance characteristics of a system-under-test.

+

All completed benchmarks must go through a strict validation process to ensure the integrity of the performance results.

+

The development of Graphalytics is supported by many active vendors in the field of large-scale graph analytics. Currently, Graphalytics already facilitates benchmarks for a large number of graph analytics platforms, such as GraphBLAS, Giraph, GraphX, and PGX.D, allowing comparison of the state-of-the-art system performance of both community-driven and industrial-driven platforms. To get started, the details of the Graphalytics documentation and its software components are described below.

+

Documents and repositories

+ +

Graphalytics competition 2023

+

In 2023, we will hold a new round of the Graphalytics competition. See the LDBC Graphalytics Benchmark presentation for an introduction to the benchmark framework and the competition’s rules.

+

Artifacts:

+ +

Rules

+
    +
  • Participation is free.
  • +
  • There are no monetary prizes.
  • +
  • Single-node and distributed implementations are allowed.
  • +
  • Partial implementations (e.g. just small to mid-sized data sets and only a few algorithms) are allowed.
  • +
  • Submissions should execute each algorithm-data set combination three times. From these, the arithmetic mean of the processing times is used for ranking.
  • +
  • The results of the competition will be published on the LDBC website in the form of leaderboards, which rank them based on performance and price-performance (adjusted for the system price).
  • +
  • There is a global leaderboard that includes all algorithms and scale factors. Additionally, there is a separate leaderboard for each scale (S, M, L, XL, 2XL+), algorithm and system category (CPU-based/GPU-based, single-node vs. distributed) for fine-grained comparison.
  • +
  • Submissions are subject to code review and reproducibility attempts from the organizers.
  • +
  • System prices should be reported following the TPC Pricing specification.
  • +
+

Recommendations for submissions

+
    +
  • Submissions using modern hardware are welcome (GPUs, FPGAs, etc.).
  • +
  • We encourage the use of cloud compute instances for running the benchmark (if possible).
  • +
+

Important dates

+
    +
  • March 17: Competition is announced
  • +
  • April 25: Confirmation of intent
  • +
  • May 1: Submissions open
  • +
  • June 25: Submissions close
  • +
+

Data sets

+

The Graphalytics data sets are compressed using zstd. The total size of the compressed archives is approx. 350GB. When decompressed, the data sets require approximately 1.5TB of disk space.

+

For detailed information on the data sets, see the table with their statistics.

+

The data sets are available in two locations:

+ +

Note that some of the Graphalytics data sets were fixed in March 2023. Prior to this, they were incorrectly packaged or had missing/incorrect reference outputs for certain algorithms. If you are uncertain whether you have the correct versions, cross-check them against these MD5 checksums: datagen-9_4-fb, datagen-sf3k-fb, datagen-sf10k-fb, graph500-27, graph500-28, graph500-29, graph500-30.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
data set#nodes#edgesscalelinksize
cit-Patents3,774,76816,518,947XScit-Patents.tar.zst119.1 MB
com-friendster65,608,3661,806,067,135XLcom-friendster.tar.zst6.7 GB
datagen-7_5-fb633,43234,185,747Sdatagen-7_5-fb.tar.zst162.3 MB
datagen-7_6-fb754,14742,162,988Sdatagen-7_6-fb.tar.zst200.0 MB
datagen-7_7-zf13,180,50832,791,267Sdatagen-7_7-zf.tar.zst434.5 MB
datagen-7_8-zf16,521,88641,025,255Sdatagen-7_8-zf.tar.zst544.3 MB
datagen-7_9-fb1,387,58785,670,523Sdatagen-7_9-fb.tar.zst401.2 MB
datagen-8_0-fb1,706,561107,507,376Mdatagen-8_0-fb.tar.zst502.5 MB
datagen-8_1-fb2,072,117134,267,822Mdatagen-8_1-fb.tar.zst625.4 MB
datagen-8_2-zf43,734,497106,440,188Mdatagen-8_2-zf.tar.zst1.4 GB
datagen-8_3-zf53,525,014130,579,909Mdatagen-8_3-zf.tar.zst1.7 GB
datagen-8_4-fb3,809,084269,479,177Mdatagen-8_4-fb.tar.zst1.2 GB
datagen-8_5-fb4,599,739332,026,902Ldatagen-8_5-fb.tar.zst1.5 GB
datagen-8_6-fb5,667,674421,988,619Ldatagen-8_6-fb.tar.zst1.9 GB
datagen-8_7-zf145,050,709340,157,363Ldatagen-8_7-zf.tar.zst4.6 GB
datagen-8_8-zf168,308,893413,354,288Ldatagen-8_8-zf.tar.zst5.3 GB
datagen-8_9-fb10,572,901848,681,908Ldatagen-8_9-fb.tar.zst3.7 GB
datagen-9_0-fb12,857,6711,049,527,225XLdatagen-9_0-fb.tar.zst4.6 GB
datagen-9_1-fb16,087,4831,342,158,397XLdatagen-9_1-fb.tar.zst5.8 GB
datagen-9_2-zf434,943,3761,042,340,732XLdatagen-9_2-zf.tar.zst13.7 GB
datagen-9_3-zf555,270,0531,309,998,551XLdatagen-9_3-zf.tar.zst17.4 GB
datagen-9_4-fb29,310,5652,588,948,669XLdatagen-9_4-fb.tar.zst14.0 GB
datagen-sf3k-fb33,484,3752,912,009,743XLdatagen-sf3k-fb.tar.zst12.7 GB
datagen-sf10k-fb100,218,7509,404,822,5382XLdatagen-sf10k-fb.tar.zst40.5 GB
dota-league61,17050,870,313Sdota-league.tar.zst114.3 MB
graph500-222,396,65764,155,735Sgraph500-22.tar.zst202.4 MB
graph500-234,610,222129,333,677Mgraph500-23.tar.zst410.6 MB
graph500-248,870,942260,379,520Mgraph500-24.tar.zst847.7 MB
graph500-2517,062,472523,602,831Lgraph500-25.tar.zst1.7 GB
graph500-2632,804,9781,051,922,853XLgraph500-26.tar.zst3.4 GB
graph500-2763,081,0402,111,642,032XLgraph500-27.tar.zst7.1 GB
graph500-28121,242,3884,236,163,9582XLgraph500-28.tar.zst14.4 GB
graph500-29232,999,6308,493,569,1152XLgraph500-29.tar.zst29.6 GB
graph500-30447,797,98617,022,117,3623XLgraph500-30.tar.zst60.8 GB
kgs832,24717,891,698XSkgs.tar.zst65.7 MB
twitter_mpi52,579,6781,963,263,508XLtwitter_mpi.tar.zst5.7 GB
wiki-Talk2,394,3855,021,4102XSwiki-Talk.tar.zst34.9 MB
example-directed1017-example-directed.tar.zst1.0 KB
example-undirected912-example-undirected.tar.zst1.0 KB
test-bfs-directed<100<100-test-bfs-directed.tar.zst<2.0 KB
test-bfs-undirected<100<100-test-bfs-undirected.tar.zst<2.0 KB
test-cdlp-directed<100<100-test-cdlp-directed.tar.zst<2.0 KB
test-cdlp-undirected<100<100-test-cdlp-undirected.tar.zst<2.0 KB
test-pr-directed<100<100-test-pr-directed.tar.zst<2.0 KB
test-pr-undirected<100<100-test-pr-undirected.tar.zst<2.0 KB
test-lcc-directed<100<100-test-lcc-directed.tar.zst<2.0 KB
test-lcc-undirected<100<100-test-lcc-undirected.tar.zst<2.0 KB
test-wcc-directed<100<100-test-wcc-directed.tar.zst<2.0 KB
test-wcc-undirected<100<100-test-wcc-undirected.tar.zst<2.0 KB
test-sssp-directed<100<100-test-sssp-directed.tar.zst<2.0 KB
test-sssp-undirected<100<100-test-sssp-undirected.tar.zst<2.0 KB
+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/graphalytics/spec-graphalytics-competitions.pdf b/benchmarks/graphalytics/spec-graphalytics-competitions.pdf new file mode 100644 index 00000000..51a0a47d Binary files /dev/null and b/benchmarks/graphalytics/spec-graphalytics-competitions.pdf differ diff --git a/benchmarks/index.html b/benchmarks/index.html new file mode 100644 index 00000000..bcee4370 --- /dev/null +++ b/benchmarks/index.html @@ -0,0 +1,765 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+ +
+ +

We are delighted to announce the official release of the initial version (v0.1.0) of Financial Benchmark (FinBench).

+

The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the LDBC FinBench Task Force. The benchmark has one workload currently, Transaction Workload, capturing an OLTP scenario with complex read queries that access the …

+ +
+
+ +
+ + +
+
+
+ +

Posts

+
Tags:
+ +
+
+ + + +
+
+ +
+ + +
+
+
+ +

LDBC SNB – Early 2023 updates

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

2023 has been an eventful year for us so far. Here is a summary of our recent activities.

+
    +
  1. +

    Our paper The LDBC Social Network Benchmark: Business Intelligence Workload was published in PVLDB.

    +
  2. +
  3. +

    David Püroja just completed his MSc thesis on creating a design towards SNB Interactive v2 at CWI’s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference’s graph developer room titled The LDBC Social Network …

+ +
+
+ +
+ + +
+
+
+ +

LDBC SNB Datagen – The winding path to SF100K

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my last technical update on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the …

+ +
+
+ +
+ + +
+
+ +
+ +

We are delighted to announce the set up of the Financial Benchmark (FinBench) task force.

+

The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. The FinBench is scheduled to be released in the …

+ +
+
+ +
+ + +
+
+
+ +

Speeding Up LDBC SNB Datagen

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC’s Social Network Benchmark [4] (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems’ bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/index.xml b/benchmarks/index.xml new file mode 100644 index 00000000..f4e881e2 --- /dev/null +++ b/benchmarks/index.xml @@ -0,0 +1,5297 @@ + + + + Benchmarks on Linked Data Benchmark Council + https://ldbcouncil.org/benchmarks/ + Recent content in Benchmarks on Linked Data Benchmark Council + Hugo -- gohugo.io + en-us + &copy; Copyright LDBC 2024 + + Announcing the Official Release of LDBC Financial Benchmark v0.1.0 + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + Tue, 27 Jun 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + <p>We are delighted to announce the official release of the initial version (v0.1.0) of <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench)</a>.</p> +<p>The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">LDBC FinBench Task Force</a>. The benchmark has one workload currently, <strong>Transaction Workload</strong>, capturing OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.</p> +<p>Compared to LDBC SNB, the FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the <a href="https://ldbcouncil.org/benchmarks/finbench/finbench-talk-16th-tuc.pdf">slides</a> in the 16th TUC. 
The <a href="https://arxiv.org/pdf/2306.15975.pdf">Financial Benchmark&rsquo;s specification</a> can be found on arXiv.</p> +<p>The release of the FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2022. It is a good beginning for FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.</p> +<p>If you are interested in joining the FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.</p> + + + + + Sixteenth TUC Meeting + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + Fri, 23 Jun 2023 09:00:00 -0800 + + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Oskar van Rest, Alastair Green, Gábor Szárnyas</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2023.sigmod.org/venue.shtml">SIGMOD 2023</a> on <strong>June 23-24 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10- and 15-minute talks followed by a Q&amp;A session. The talks will be recorded and made available online. 
<strong>If you would like to participate please register using <a href="https://forms.gle/T6bwVHzK9V5FaKyR9">our form</a>.</strong></p> +<p>LDBC will host a <strong>social event</strong> on Friday at the <a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a>.</p> +<p>In addition, AWS will host a <strong>Happy Hour</strong> (rooftop grill with beverages) on Saturday on the Amazon Nitro South building&rsquo;s 8th floor deck: <a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>.</p> +<h3 id="program">Program</h3> +<p><strong>All times are in PDT.</strong></p> +<h4 id="friday">Friday</h4> +<p><strong>Location:</strong> Hyatt Regency Bellevue on Seattle&rsquo;s Eastside, <strong>room Grand K</strong>, co-located with SIGMOD (<a href="https://www.hyatt.com/en-US/hotel/washington/hyatt-regency-bellevue-on-seattles-eastside/belle">900 Bellevue Way NE, Bellevue, WA 98004-4272</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>08:30</td> +<td>08:45</td> +<td>Oskar van Rest (Oracle)</td> +<td>LDBC – State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-ldbc-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/Frk7ITssaSY">video</a></td> +</tr> +<tr> +<td>08:50</td> +<td>09:05</td> +<td>Keith Hare (JCC / WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/keith-hare-an-update-on-the-gql-and-sql-pgq-standards-efforts.pdf">slides</a>, <a href="https://youtu.be/LQYkal_0j6E">video</a></td> +</tr> +<tr> +<td>09:10</td> +<td>09:25</td> +<td>Stefan Plantikow (Neo4j / WG3)</td> +<td>GQL - Introduction to a new query 
language standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/stefan-plantikow-gql-v1.pdf">slides</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Leonid Libkin (University of Edinburgh &amp; RelationalAI)</td> +<td>Formalizing GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/leonid-libkin-formalizing-gql.pdf">slides</a>, <a href="https://youtu.be/YZE1a00h1I4">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Semen Panenkov (JetBrains Research)</td> +<td>Mechanizing the GQL semantics in Coq – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/semyon-panenkov-gql-in-coq.pdf">slides</a>, <a href="https://youtu.be/5xBGohqWCzo">videos</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Oskar van Rest (Oracle)</td> +<td>SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-sql-property-graphs-in-oracle-database-and-oracle-graph-server-pgx.pdf">slides</a>, <a href="https://youtu.be/owM9WiQubpg">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (JCC)</td> +<td>LDBC&rsquo;s organizational changes and fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-ldbc-corporate-restructuring-and-fair-use-policies.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>Ioana Manolescu (INRIA)</td> +<td>Integrating Connection Search in Graph Queries – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ioana-manolescu-integrating-connection-search-in-graph-queries.pdf">slides</a>, <a 
href="https://youtu.be/LQPnmcrkUpY">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Maciej Besta (ETH Zurich)</td> +<td>Neural Graph Databases with Graph Neural Networks – <a href="https://youtu.be/ce5qNievRNs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:10</td> +<td>Longbin Lai (Alibaba Damo Academy)</td> +<td>To Revisit Benchmarking Graph Analytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/longbin-lai-benchmark-ldbc.pdf">slides</a>, <a href="https://youtu.be/s9Vtt-6t_FI">video</a></td> +</tr> +<tr> +<td>12:15</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>The World of Graph Databases from An Industry Perspective – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/yuanyuan-tian-world-of-graph-databases.pdf">slides</a>, <a href="https://youtu.be/AZuP_b95GPM">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Alin Deutsch (UC San Diego &amp; TigerGraph)</td> +<td>TigerGraph&rsquo;s Parallel Computation Model – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alin-deutsch-tigergraphs-computation-model.pdf">slides</a>, <a href="https://youtu.be/vcxdieJB80Y">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Chen Zhang (CreateLink)</td> +<td>Applications of a Native Distributed Graph Database in the Financial Industry – <a href="https://youtu.be/GCCT79Sps9I">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Ricky Sun (Ultipa)</td> +<td>Design of highly scalable graph database systems – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ricky-sun-ultipa.pdf">slides</a>, <a href="https://youtu.be/Sg1F64O4vGM">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> 
+<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Heng Lin (Ant Group)</td> +<td>The LDBC SNB implementation in TuGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-the-ldbc-snb-implementation-in-tugraph.pdf">slides</a>, <a href="https://youtu.be/fy8AuVerwnY">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>Shipeng Qi (Ant Group)</td> +<td>FinBench: The new LDBC benchmark targeting financial scenario – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/shipeng-qi-finbench.pdf">slides</a>, <a href="https://youtu.be/0xLZadDOfZk">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>17:00</td> +<td>host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>FinBench panel – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-finbench-panel.pdf">slides</a></td> +</tr> +<tr> +<td>19:00</td> +<td>22:00</td> +<td><em>dinner</em></td> +<td><em><a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a></em></td> +</tr> +</tbody> +</table> +<h4 id="saturday">Saturday</h4> +<p><strong>Location:</strong> Amazon Nitro South building, <strong>room 03.204</strong> (<a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:00</td> +<td>09:45</td> +<td>Brad Bebee (AWS)</td> +<td>Customers don&rsquo;t want a graph database, so why are we still here? 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/brad-bebee-tuc-keynote.pdf">slides</a>, <a href="https://youtu.be/bJlkpDC--fM">video</a></td> +</tr> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Fast and Efficient Update Handling for Graph H2TAP – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/muhammad-attahir-jibril-fast-and-efficient-update-handling-for-graph-h2tap.pdf">slides</a>, <a href="https://youtu.be/e8ZAszBsXV0">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Gabor Szarnyas (CWI)</td> +<td>LDBC Social Network Benchmark and Graphalytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-social-network-benchmark-and-graphalytics.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:30</td> +<td>Atanas Kiryakov and Tomas Kovachev (Ontotext)</td> +<td>GraphDB – Benchmarking against LDBC SNB &amp; SPB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tomas-kovatchev-atanas-kiryakov-benchmarking-graphdb-with-snb-and-spb.pdf">slides</a>, <a href="https://youtu.be/U6OPpNFOWqg">video</a></td> +</tr> +<tr> +<td>11:35</td> +<td>11:50</td> +<td>Roi Lipman (Redis Labs)</td> +<td>Delta sparse matrices within RedisGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/roi-lipman-delta-matrix.pdf">slides</a>, <a href="https://youtu.be/qfKsplV4Ihk">video</a></td> +</tr> +<tr> +<td>11:55</td> +<td>12:05</td> +<td>Rathijit Sen (Microsoft)</td> +<td>Microarchitectural Analysis of Graph BI Queries on RDBMS – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/rathijit-sen-microarchitectural-analysis.pdf">slides</a>, <a href="https://youtu.be/55B8CkH09js">video</a></td> +</tr> +<tr> +<td>12:10</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td><em>on your own</em></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Alastair Green (JCC)</td> +<td>LEX &ndash; LDBC Extended GQL Schema – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-lex.pdf">slides</a>, <a href="https://youtu.be/DVpeb4Ce9Uw">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Ora Lassila (AWS)</td> +<td>Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ora-lassila-why-limit-yourself-to-lpg-when-you-can-do-rdf-too.pdf">slides</a>, <a href="https://youtu.be/7uAInoUwdds">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Jan Hidders (Birkbeck, University of London)</td> +<td>PG-Schema: a proposal for a schema language for property graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/jan-hidders-pg-schema.pdf">slides</a>, <a href="https://youtu.be/yQNL8hBTE4M">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Max de Marzi (RageDB and RelationalAI)</td> +<td>RageDB: Building a Graph Database in Anger – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/max-de-marzi-ragedb-building-a-graph-database-in-anger.pdf">slides</a>, <a href="https://youtu.be/LBbF8aslYFE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Umit Catalyurek (AWS)</td> +<td>HPC Graph Analytics on the OneGraph Model – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/umit-catalyurek-onegraph-hpc.pdf">slides</a>, <a href="https://youtu.be/64tv5LA6Wr8">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>David J. Haglin (Trovares)</td> +<td>How LDBC impacts Trovares – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/david-haglin-trovares.pdf">slides</a>, <a href="">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>16:25</td> +<td>Wenyuan Yu (Alibaba Damo Academy)</td> +<td>GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/wenyuan-yu-graphscope-flex.pdf">slides</a>, <a href="https://youtu.be/cRikoyDmMks">video</a></td> +</tr> +<tr> +<td>16:30</td> +<td>16:40</td> +<td>Scott McMillan (Carnegie Mellon University)</td> +<td>Graph processing using GraphBLAS – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/scott-mcmillan-graph-processing-using-graphblas.pdf">slides</a>, <a href="https://youtu.be/yb4hGBhUzQQ">video</a></td> +</tr> +<tr> +<td>16:45</td> +<td>16:55</td> +<td>Tim Mattson (Intel)</td> +<td>Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tim-mattson-graphblas-and-tiledb.pdf">slides</a></td> +</tr> +<tr> +<td>17:00</td> +<td>20:00</td> +<td><em>happy hour (rooftop grill with beverages)</em></td> +<td><em>on the Nitro South building&rsquo;s 8th floor deck</em></td> +</tr> +</tbody> +</table> +<h4 id="tuc-event-locations">TUC event locations</h4> +<p>A <a href="https://www.google.com/maps/d/u/0/edit?mid=19_fi4fV-3-PZkNWCCcmhU86ct2EZXbgo">map of the LDBC TUC events</a> we hosted so far.</p> + + + + + LDBC SNB – Early 2023 updates + 
https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + Wed, 15 Feb 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + <p>2023 has been an eventful year for us so far. Here is a summary of our recent activities.</p> +<ol> +<li> +<p>Our paper <a href="https://ldbcouncil.org/docs/papers/ldbc-snb-bi-vldb-2022.pdf">The LDBC Social Network Benchmark: Business Intelligence Workload</a> was published in PVLDB.</p> +</li> +<li> +<p>David Püroja just completed his MSc thesis on creating a design towards <a href="https://ldbcouncil.org/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf">SNB Interactive v2</a> at CWI&rsquo;s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference&rsquo;s graph developer room titled <a href="https://fosdem.org/2023/schedule/event/graph_ldbc/">The LDBC Social Network Benchmark</a> (<a href="https://www.youtube.com/watch?v=YNF6z6gtXY4">YouTube mirror</a>).</p> +</li> +<li> +<p>I gave a lightning talk at FOSDEM&rsquo;s HPC developer room titled <a href="https://www.youtube.com/watch?v=q26DHnQFw54">The LDBC Benchmark Suite</a> (<a href="https://www.youtube.com/watch?v=q26DHnQFw54">YouTube mirror</a>).</p> +</li> +<li> +<p>Our auditors have successfully benchmarked a number of systems:</p> +<ul> +<li>SPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze)</li> +<li>SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja)</li> +<li>SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr)</li> +</ul> +</li> +</ol> +<p>The results and the full disclosure reports are available under the <a href="https://ldbcouncil.org/benchmarks/spb/">SPB</a> and <a href="https://ldbcouncil.org/benchmarks/snb/">SNB benchmark pages</a>.</p> + + + + + LDBC SNB Datagen – The winding path to SF100K + 
https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + Tue, 13 Sep 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + <p>LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">last technical update</a> on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we have reached several goals: we refactored the serializers to use Spark&rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.</p> +<h1 id="moving-to-sparksql">Moving to SparkSQL</h1> +<p>We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL&rsquo;s capabilities.</p> +<blockquote> +<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. 
Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.</p> +</blockquote> +<p>Dealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL is unsuitable. We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup>, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators&rsquo; state between the UDFs to ensure deterministic execution. Weighing these and that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:</p> +<ol> +<li>table manipulations related to shaping the output into the supported layouts and data types as set forth in the specification;</li> +<li>deriving the Interactive and BI datasets;</li> +<li>and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes.</li> +</ol> +<p>We refer to points (1.) and (2.) collectively as dataset transformation, and to (3.) as factor generation. Initially, these had been part of the generator; they were extracted as part of this refactor, which resulted in a cleaner, more maintainable design.</p> +<p><img src="datagen_df_0.png" alt="Datagen stages"></p> +<p>The diagram above shows the components on a high level. 
The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there&rsquo;s no simple way to avoid<br> +it, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.</p> +<p>I&rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">previous blogpost in the series</a> or the <a href="https://arxiv.org/abs/2001.02299">Interactive benchmark specification</a>.</p> +<h1 id="transformation-pipeline">Transformation pipeline</h1> +<p>The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:</p> +<ul> +<li>explodes edges and / or attributes into separate tables,</li> +<li>subsets the snapshot part and creates insert / delete batches for the BI workload,</li> +<li>subsets the snapshot part for the Interactive workload,</li> +<li>applies formatting related options such as date time representation,</li> +<li>serializes the data to a Spark supported format (CSV, Parquet),</li> +</ul> +<p>We utilize a flexible data pipeline that operates on the graph.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code 
class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span>, <span style="color:#66d9ef">M2</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">]</span> <span style="color:#a6e22e">extends</span> <span style="color:#f92672">(</span><span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">])</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">In</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span 
style="color:#66d9ef">def</span> apply<span style="color:#f92672">(</span>v<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">])</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>v<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>The <code>Transform</code> trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let&rsquo;s see some of the transformations we have.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToBiTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">BI</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> keepImplicitDeletes<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span 
style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.BI</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span 
style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeEdges</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> 
+</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Therefore, a transformation pipeline may look like this:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>params<span style="color:#f92672">,</span> start<span style="color:#f92672">,</span> end<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>inputGraph<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>The <code>Graph</code> record has a <code>definition</code> field containing graph-global metadata, whereas <code>entities</code> holds the 
datasets keyed by their entity type. There are 3 graph <em>modes</em> currently: <code>Raw</code>, <code>Interactive</code> and <code>BI</code>. The BI dataset has a different layout than the rest, as it contains incremental inserts and deletes for the entities in addition to the bulk snapshot. This is captured in the <code>Layout</code> dependent type, over which the entities are polymorphic.</p> +<p>It&rsquo;s important to understand that <code>Graph</code> holds <code>DataFrame</code>s, and these are lazily computed by Spark. So, <code>Graph</code> is merely a description of transformations used to derive the comprising datasets, which makes them subject to all the SparkSQL fanciness such as query optimization, whole stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> isAttrExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> isEdgesExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> useTimestamp<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> mode<span style="color:#66d9ef">:</span> 
<span style="color:#66d9ef">M</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">Option</span><span style="color:#f92672">[</span><span style="color:#66d9ef">String</span><span style="color:#f92672">]]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> definition<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M</span><span style="color:#f92672">],</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">M</span><span style="color:#66d9ef">#</span><span style="color:#66d9ef">Layout</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">sealed</span> <span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Layout</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Raw</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Interactive</span><span style="color:#f92672">(</span>bulkLoadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... 
*/</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">BI</span><span style="color:#f92672">(</span>bulkloadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">,</span> batchPeriod<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">String</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">BatchedEntity</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>You may notice that <code>Transform</code> is statically typed w.r.t. <code>Mode</code>; however, other properties, like <code>isAttrExploded</code> or <code>isEdgesExploded</code>, are not captured in the type, and remain merely dynamic. This makes some nonsensical transformation pipelines (e.g. one that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.</p> +<p>As we already mentioned, <code>Graph</code> is essentially a persistent container of <code>EntityType -&gt; DataFrame</code> mappings. 
<code>EntityType</code> can be <code>Node</code>, <code>Edge</code> and <code>Attr</code>, and is used to identify the entity and embellish it with static metadata, such as a descriptive name and primary key, whether it is static or dynamic (as per the specification), and in case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.</p> +<p>Usually, a graph transformation involves matching entities based on their <code>EntityType</code>, and modifying the mapping (and if required, other metadata). Take, for example, the <code>ExplodeAttrs</code> transformation, which explodes into separate tables the values of two columns of <code>Person</code> stored as arrays:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">if</span> <span 
style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#75715e">// assert at runtime that the transformation hasn&#39;t been applied yet +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#66d9ef">throw</span> <span style="color:#66d9ef">new</span> <span style="color:#a6e22e">AssertionError</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Attributes already exploded in the input graph&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> explodedAttr<span style="color:#f92672">(</span>attr<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Attr</span><span style="color:#f92672">,</span> node<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">DataFrame</span><span style="color:#f92672">,</span> column<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Column</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">=</span> +</span></span><span style="display:flex;"><span> attr <span style="color:#f92672">-&gt;</span> node<span style="color:#f92672">.</span>select<span style="color:#f92672">(</span>withRawColumns<span style="color:#f92672">(</span>attr<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>parent<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">),</span> 
explode<span style="color:#f92672">(</span>split<span style="color:#f92672">(</span>column<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;;&#34;</span><span style="color:#f92672">)).</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>attribute<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">)))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> modifiedEntities <span style="color:#66d9ef">=</span> input<span style="color:#f92672">.</span>entities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>collect <span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k <span style="color:#66d9ef">@</span> <span style="color:#a6e22e">Node</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person&#34;</span><span style="color:#f92672">,</span> <span style="color:#66d9ef">false</span><span style="color:#f92672">),</span> df<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#75715e">// match the Person node. 
This is the only one ExplodeAttrs should modify +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#a6e22e">Map</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Email&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;EmailAddress&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonEmailEmailAddress&#34; entity derived by exploding the email column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Speaks&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;Language&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonSpeaksLanguage&#34; entity derived by exploding the language column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> k <span style="color:#f92672">-&gt;</span> df<span style="color:#f92672">.</span>drop<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">)</span> <span style="color:#75715e">// drop the exploded 
columns from person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntities <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>entities<span style="color:#f92672">)(</span><span style="color:#66d9ef">_</span> <span style="color:#f92672">++</span> <span style="color:#66d9ef">_</span><span style="color:#f92672">)</span> <span style="color:#75715e">// merge-replace the modified entities in the graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntityDefinitions <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#f92672">(</span>e<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> e <span style="color:#f92672">++</span> v<span style="color:#f92672">.</span>map<span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> k <span 
style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Some</span><span style="color:#f92672">(</span>v<span style="color:#f92672">.</span>schema<span style="color:#f92672">.</span>toDDL<span style="color:#f92672">)</span> <span style="color:#f92672">}</span> <span style="color:#75715e">// update the entity definition schema to reflect the modifications +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> l <span style="color:#66d9ef">=</span> lens<span style="color:#f92672">[</span><span style="color:#66d9ef">In</span><span style="color:#f92672">]</span> <span style="color:#75715e">// lenses provide a terse syntax for modifying nested fields +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">(</span>l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>entities<span style="color:#f92672">).</span>set<span style="color:#f92672">(</span>input<span style="color:#f92672">)((</span><span style="color:#66d9ef">true</span><span style="color:#f92672">,</span> updatedEntityDefinitions<span style="color:#f92672">,</span> updatedEntities<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Note that <code>EntityType</code> does not hold the dataset&rsquo;s full SQL schema currently, as it&rsquo;s not useful for pattern matching, but can be accessed directly from <code>DataFrame</code> if 
needed.</p> +<h1 id="inputoutput">Input/output</h1> +<p>The <code>Reader</code> and <code>Writer</code> typeclasses are used to read from a <code>Source</code> and write to a <code>Sink</code> respectively, terminating a graph transformation pipeline<br> +on both ends.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">T</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> read<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> exists<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">S</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Data</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> write<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Data</span><span style="color:#f92672">,</span> sink<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">S</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Unit</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>There are implementations under <code>ldbc.datagen.io.instances</code> that read a graph from a <code>GraphSource</code> and write to a <code>GraphSink</code>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span 
style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> source <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSource</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>read<span style="color:#f92672">(</span>source<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span><span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transformedGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>graph<span style="color:#f92672">)</span> +</span></span><span 
style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> sink <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSink</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>write<span style="color:#f92672">(</span>transformedGraph<span style="color:#f92672">,</span> sink<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>We provide <a href="https://github.com/typelevel/simulacrum">Ops syntax</a> to make it shorter:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> 
ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Reader.ops._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Writer.ops._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">).</span>read +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transformedGraph <span 
style="color:#66d9ef">=</span> <span style="color:#f92672">???</span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span>transformedGraph<span style="color:#f92672">.</span>write<span style="color:#f92672">(</span><span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">))</span> +</span></span></code></pre></div><p>The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.</p> +<p>Spark has a facility to derive SparkSQL schema from case classes automatically<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup>. We created case classes for each entity in the <code>Raw</code> dataset. 
We also created a typeclass <code>EntityTraits</code> associating these classes with their <code>EntityType</code>, so we can summon them (and consequently their SparkSQL schema) in the reader.</p> +<p>The case classes are used during the serialization of the generated dataset too, but more about that later.</p> +<h1 id="factor-generation">Factor generation</h1> +<p>As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity-local aggregates was impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a manageable scope, the original factor generation code had been removed.</p> +<p>We decided it&rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator&rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number of factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing certain factor tables even use <a href="https://spark.apache.org/graphx/">GraphX</a>, which was unimaginable with the previous design.</p> +<p>Factor tables are added by extending a map with a <code>name -&gt; Factor</code> pair. 
<code>Factor</code> declares is input entities, and accepts a function that receives input <code>DataFrames</code>, and returns a single <code>DataFrame</code> as output.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> factors <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Map</span> <span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;personDisjointEmployerPairs&#34;</span> <span style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Factor</span><span style="color:#f92672">(</span><span style="color:#a6e22e">PersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonKnowsPersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">OrganisationType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonWorkAtCompanyType</span><span style="color:#f92672">)</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">case</span> <span style="color:#a6e22e">Seq</span><span style="color:#f92672">(</span>person<span style="color:#f92672">,</span> personKnowsPerson<span style="color:#f92672">,</span> organisation<span style="color:#f92672">,</span> workAt<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> knows <span style="color:#66d9ef">=</span> undirectedKnows<span style="color:#f92672">(</span>personKnowsPerson<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> company <span style="color:#66d9ef">=</span> 
organisation<span style="color:#f92672">.</span>where<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;Type&#34;</span> <span style="color:#f92672">===</span> <span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">).</span>cache<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> personSample <span style="color:#66d9ef">=</span> person +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>orderBy<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>limit<span style="color:#f92672">(</span><span style="color:#ae81ff">20</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> personSample +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person2&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>knows<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;knows&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;knows.person2Id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>workAt<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;workAt&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;workAt.PersonId&#34;</span> <span 
style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;knows.Person1id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>company<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;Company.id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;workAt.CompanyId&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>select<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2id&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.name&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyName&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyId&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.creationDate&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2creationDate&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.deletionDate&#34;</span><span 
style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2deletionDate&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>distinct<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">},</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* more factors */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span></code></pre></div><p>As you can see, it&rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. <code>undirectedKnows</code>). Currently, there&rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. it uses the dataframe writer under the hood.</p> +<h1 id="revamping-the-data-generators-serializer">Revamping the data generator&rsquo;s serializer</h1> +<p>At this point, both the transformation pipeline and factor generator were ready, however the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator&rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we&rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.</p> +<blockquote> +<p><a href="https://parquet.apache.org/">Parquet</a> is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. 
It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.</p> +</blockquote> +<p>The new serialization framework is heavily influenced by the design of Java <code>OutputStreams</code>, in the sense that stateful objects are composed to form a pipeline. For example, in the case of <em>activities</em>, the input is an activity tree, and the output is a set of rows in multiple files (e.g. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown in the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).</p> +<p><img src="activity.png" alt="Activity serialization pipeline"></p> +<p>The benefit of this architecture is that only the last component needs to change when we add support for a new output format.</p> +<p>To support Parquet, we made use of row-level serializers available in Hadoop&rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. Remember how we used case classes for the <code>Raw</code> entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. 
<code>Forum</code>) and Spark&rsquo;s <code>Encoder</code> framework to encode the entities in Parquet, which means that the generated output remains consistent with the <code>DataFrame</code>-based reader, and we spare a lot of code duplication.</p> +<h1 id="optimizations">Optimizations</h1> +<p>After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out-of-memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (subsequently, larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation<sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup>. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still more than a 6x reduction in cost). We weren&rsquo;t able to run SF30K on 10 machines (1 machine / SF3K), even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf3k_bi <span style="color:#ae81ff">3000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">330</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf10k_bi <span style="color:#ae81ff">10000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span 
style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">1000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><p>The above examples are working configurations for generating the 3K and 10K BI datasets. The <code>--sf-per-executor</code> option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes respectively. The <code>--partitions</code> option controls the total number of partitions, and was calculated based on the number of persons using the formula <code>partitions = ceil(number_of_persons / block_size / 3)</code> to get a maximum of 3 blocks per partition.</p> +<h1 id="conclusion">Conclusion</h1> +<p>These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.</p> +<h1 id="footnotes">Footnotes</h1> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>The generator produces hierarchies, such as a forum wall with a random number of posts, that have comments, etc. 
This tree is iterated, and different entities are written to separate files.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>Shameless plug: You can learn more on this from <a href="https://www.dataversity.net/case-study-deriving-spark-encoders-and-schemas-using-implicits/">another blogpost of mine</a>.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>The data generator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>The maximum row count per file is currently 10M, however, this can be modified with a command line option. We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Fifteenth TUC Meeting + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + Fri, 17 Jun 2022 09:20:00 -0500 + + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2022.sigmod.org/venue.shtml">SIGMOD 2022</a> on <strong>June 17-18 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10-15 minute talks followed by a Q&amp;A session. The talks will be recorded and made available online.<br> +The tentative program is the following. 
<strong>All times are in EDT.</strong></p> +<p>We will have a social event on Friday at 17:30 at <a href="https://elvezrestaurant.com/">El Vez</a> (<a href="https://g.page/ElVezPhilly">Google Maps</a>).</p> +<h4 id="friday-pennsylvania-convention-centerhttpswwwpaconventioncom-room-204bhttps2022sigmodorgprogramshtml">Friday (<a href="https://www.paconvention.com/">Pennsylvania Convention Center</a>, <a href="https://2022.sigmod.org/program.shtml">room 204B</a>)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:20</td> +<td>09:30</td> +<td>Peter Boncz (LDBC/CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/39BoOIGk9Is">video</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Alastair Green (LDBC/Birkbeck)</td> +<td>LDBC&rsquo;s fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-fair-use-of-the-ldbc-trademark.pdf">slides</a>, <a href="https://youtu.be/7zmCysN4Rpg">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)</td> +<td>LDBC Social Network Benchmark: Business Intelligence workload v1.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/gabor-szarnyas-the-ldbc-social-network-benchmark-business-intelligence-workload.pdf">slides</a>, <a href="https://youtu.be/AJ96M8_njxE">video</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Heng Lin (Ant Group)</td> +<td>LDBC Financial Benchmark introduction – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/heng-lin-ldbc-financial-benchmark-introduction.pdf">slides</a>, <a 
href="https://youtu.be/iBhud_YjafY">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Chen Zhang (CreateLink)</td> +<td>New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/chen-zhang-new-ldbc-snb-benchmark-record-by-galaxybase-more-than-6-times-faster-and-70-percent-higher-throughput.pdf">slides</a>, <a href="https://youtu.be/sMzTsb8iw_Y">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>James Clarkson (Neo4j)</td> +<td>LDBC benchmarks: Promoting good science and industrial consumption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/james-clarkson-ldbc-benchmarks-promoting-good-science-and-industrial-consumption.pdf">slides</a>, <a href="https://youtu.be/VYG1mzcl9qQ">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Oskar van Rest (Oracle)</td> +<td>Creating and querying property graphs in Oracle, on-premise and in the cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oskar-van-rest-creating-and-querying-property-graphs-in-oracle-on-premise-and-in-the-cloud.pdf">slides</a>, <a href="https://youtu.be/2HX2Vixf2gs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:15</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>Conquering LDBC SNB BI at SF-10k – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/mingxi-wu-conquering-ldbc-snb-bi-at-sf10k.pdf">slides</a>, <a href="https://youtu.be/oJbqzQ_t3G8">video</a></td> +</tr> +<tr> +<td>12:20</td> +<td>13:20</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:20</td> +<td>13:35</td> +<td>Altan Birler (Technische Universität München)</td> +<td>Relational databases can 
handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/altan-birler-relational-databases-can-handle-graphs-too.pdf">slides</a>, <a href="https://youtu.be/cRgbdY3I2i4">video</a></td> +</tr> +<tr> +<td>13:40</td> +<td>13:55</td> +<td>David Püroja (CWI)</td> +<td>LDBC Social Network Benchmark: Interactive workload v2.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/david-puroja-ldbc-snb-interactive-workload-v2.0.pdf">slides</a></td> +</tr> +<tr> +<td>14:00</td> +<td>14:15</td> +<td>Angela Bonifati (Lyon 1 University)</td> +<td>The quest for schemas in graph databases – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/angela-bonifati-the-quest-for-schemas-in-graph-databases.pdf">slides</a>, <a href="https://youtu.be/VT7cx3Jp7V8">video</a></td> +</tr> +<tr> +<td>14:20</td> +<td>14:35</td> +<td>Matteo Lissandrini (Aalborg University)</td> +<td>Understanding graph data representations in triplestores – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/matteo-lissandrini-understanding-graph-data-representations-in-triplestores.pdf">slides</a>, <a href="https://youtu.be/xqVMJZfh_JU">video</a></td> +</tr> +<tr> +<td>14:40</td> +<td>14:55</td> +<td>Wim Martens (University of Bayreuth)</td> +<td>Path representations – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/wim-martens-path-representations.pdf">slides</a>, <a href="https://youtu.be/Ma-E5dwgf-E">video</a></td> +</tr> +<tr> +<td>15:00</td> +<td>15:20</td> +<td>Audrey Cheng (UC Berkeley)</td> +<td>TAOBench: An end-to-end benchmark for social network workloads – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/audrey-cheng-taobench.pdf">slides</a>, <a href="https://youtu.be/1p8AStxS3es">video</a></td> +</tr> +</tbody> +</table> +<h4 id="saturday-philadelphia-marriott-downtownhttpswwwmarriottcomen-ushotelsphldt-philadelphia-marriott-downtown-room-401-402-4th-floor">Saturday (<a href="https://www.marriott.com/en-us/hotels/phldt-philadelphia-marriott-downtown/">Philadelphia Marriott Downtown</a>, room 401-402, 4th floor)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Keith Hare (WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/keith-hare-property-graph-standards-process-and-timing.pdf">slides</a>, <a href="https://youtu.be/xFVD3LWnKlc">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>10:35</td> +<td>Leonid Libkin (ENS Paris)</td> +<td>Pattern matching in GQL and SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/leonid-libkin-pattern-matching-in-gql-and-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/OvGsa0qLANE">video</a></td> +</tr> +<tr> +<td>10:40</td> +<td>10:55</td> +<td>Petra Selmer (Neo4j/WG3)</td> +<td>An overview of GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/petra-selmer-towards-gql-v1-a-property-graph-query-language-standard.pdf">slides</a>, <a href="https://youtu.be/tncf2FgyIyo">video</a></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (LDBC/WG3)</td> +<td>GQL 2.0: A technical manifesto – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-gql-2.0-a-technical-manifesto.pdf">slides</a>, <a 
href="https://youtu.be/upIvpYy8C2g">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>George Fletcher (TU Eindhoven)</td> +<td>PG-Keys (LDBC Property Graph Schema Working Group) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/george-fletcher-pg-keys-keys-for-property-graphs.pdf">slides</a>, <a href="https://youtu.be/_W8-jOtcObc">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Arvind Shyamsundar (Microsoft)</td> +<td>Graph capabilities in Microsoft SQL Server and Azure SQL Database – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/arvind-shyamsundar-graph-capabilities-in-microsoft-sql-server-and-azure-database.pdf">slides</a>, <a href="https://youtu.be/xxV2BfZupGw">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>13:30</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Daniël ten Wolde (CWI)</td> +<td>Implementing SQL/PGQ in DuckDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/daniel-ten-wolde-implementing-sql-pgq-in-duckdb.pdf">slides</a>, <a href="https://youtu.be/JmSfU0BTH5w">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Oszkár Semeráth, Kristóf Marussy (TU Budapest)</td> +<td>Generation techniques for consistent, realistic, diverse, and scalable graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oszkar-semerath-generation-techniques-for-consistent-realistic-diverse-and-scalable-graphs.pdf">slides</a>, <a href="https://youtu.be/hB6j6mvh-vA">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Molham Aref (RelationalAI)</td> +<td>Graph Normal Form – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/molham-aref-graph-normal-form.pdf">slides</a>, <a 
href="https://youtu.be/-kP4Raqr5KA">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Naomi Arnold (Queen Mary University of London)</td> +<td>Temporal graph analysis of the far-right social network Gab – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/naomi-arnold-temporal-graph-analysis-of-the-far-right-social-network-gab.pdf">slides</a>, <a href="https://youtu.be/ugSkFlif4PE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:05</td> +<td>Domagoj Vrgoč (PUC Chile)</td> +<td>Evaluating path queries in MillenniumDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/domagoj-vrgoc-regular-path-queries-in-millenniumdb.pdf">slides</a>, <a href="https://youtu.be/_OzJ6vI7GNU">video</a></td> +</tr> +<tr> +<td>15:10</td> +<td>15:25</td> +<td>Pavel Klinov, Evren Sirin (Stardog)</td> +<td>Stardog&rsquo;s experience with LDBC – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/evren-sirin-stardog-experience-with-ldbc.pdf">slides</a>, <a href="https://youtu.be/CBrEeOTqGKM">video</a></td> +</tr> +</tbody> +</table> + + + + + Announcing the LDBC Financial Benchmark Task Force + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + Thu, 26 May 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + <p>We are delighted to announce the set up of the <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench) task force</a>.</p> +<p>The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. 
The FinBench is scheduled to be released at the end of 2022.</p> +<p>Compared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.</p> +<p>The FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). See the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">Work Charter for FinBench</a>.</p> +<p>If you are interested in joining the FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.</p> + + + + + Fourteenth TUC Meeting + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + Mon, 16 Aug 2021 16:00:00 +0200 + + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + <p>LDBC hosted a one-day hybrid workshop, co-located with <a href="https://vldb.org/2021/">VLDB 2021</a> on <strong>August 16 (Monday) between 16:00–20:00 CEST</strong>.</p> +<p>The physical part of the workshop was held in room Akvariet 2 of the <a href="https://www.tivolihotel.com/">Tivoli Hotel</a> (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC&rsquo;s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.</p> +<p>Talks were scheduled to be 10 minutes with a short Q&amp;A session. We had three sessions. 
Their schedules are shown below.</p> +<h4 id="16001725-cest-ldbc-updates-benchmarks-query-languages">[16:00–17:25 CEST] LDBC updates, benchmarks, query languages</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>16:00</td> +<td>Peter Boncz (CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a></td> +</tr> +<tr> +<td>16:05</td> +<td>Gábor Szárnyas (CWI)</td> +<td>Overview of LDBC benchmarks – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-benchmarks.pdf">slides</a></td> +</tr> +<tr> +<td>16:12</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>LDBC Social Network Benchmark results with TigerGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mingxi-wu-tigergraph-snb-preliminary-results.pdf">slides</a></td> +</tr> +<tr> +<td>16:24</td> +<td>Xiaowei Zhu (Ant Group)</td> +<td>Financial Benchmark proposal – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/xiaowei-zhu-financial-benchmark.pdf">slides</a></td> +</tr> +<tr> +<td>16:36</td> +<td>Petra Selmer (Neo4j)</td> +<td>Status report from the Existing Languages Working Group (ELWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/petra-selmer-elwg.pdf">slides</a>, <a href="https://youtu.be/I5A8VuFDhsA">video</a></td> +</tr> +<tr> +<td>16:48</td> +<td>Jan Hidders (Birkbeck)</td> +<td>Status report from the Property Graph Schema Working Group (PGSWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/jan-hidders-pgswg.pdf">slides</a>, <a href="https://youtu.be/iEbVi9T-HVk">video</a></td> +</tr> +<tr> +<td>17:00</td> +<td>Keith Hare (JCC 
Consulting)</td> +<td>Database Language Standards Structure and Process, SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/keith-hare-database-language-standards-structure-and-process-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/ZgFCuzods4g">video</a></td> +</tr> +<tr> +<td>17:12</td> +<td>Stefan Plantikow (GQL Editor)</td> +<td>Report on the GQL standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/stefan-plantikow-gql.pdf">slides</a>, <a href="https://youtu.be/z0pN5NwKsgc">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="17351845-cest-systems-and-data-structures">[17:35–18:45 CEST] Systems and data structures</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>17:35</td> +<td>Vasileios Trigonakis (Oracle Labs)</td> +<td>PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasileios-trigonakis-pgxd-adfs.pdf">slides</a>, <a href="https://youtu.be/cv2ZfWRBOek">video</a></td> +</tr> +<tr> +<td>17:47</td> +<td>Matthias Hauck (SAP)</td> +<td>JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/matthias-hauck-json-spatial-graph-sap-hana-cloud.pdf">slides</a>, <a href="https://youtu.be/dgpMJFho6Q8">video</a></td> +</tr> +<tr> +<td>17:59</td> +<td>Nikolay Yakovets (Eindhoven University of Technology)</td> +<td>AvantGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/nikolay-yakovets-avantgraph.pdf">slides</a>, <a href="https://youtu.be/9M9FOycovTw">video</a></td> +</tr> +<tr> +<td>18:11</td> +<td>Semih Salihoglu (University of Waterloo)</td> +<td>GRainDB: Making RDBMSs Efficient on Graph Workloads 
Through Predefined Joins – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semih-salihoglu-graindb.pdf">slides</a>, <a href="https://youtu.be/FFK3y6vPHJs">video</a></td> +</tr> +<tr> +<td>18:23</td> +<td>Semyon Grigorev (Saint Petersburg University)</td> +<td>Context-free path querying: Obstacles on the way to adoption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semyon-grigorev-cfpq.pdf">slides</a>, <a href="https://youtu.be/pha1xIpEL3I">video</a></td> +</tr> +<tr> +<td>18:35</td> +<td>Per Fuchs (Technical University of Munich)</td> +<td>Sortledton: A universal, transactional graph data structure – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/per-fuchs-sortledton.pdf">slides</a>, <a href="https://youtu.be/33ZjsNN0hhU">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="1855-2000-cest-high-level-approaches-and-benchmarks">[18:55-20:00 CEST] High-level approaches and benchmarks</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>18:55</td> +<td>Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)</td> +<td>Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/angelos-christos-anadiotis-investigative-journalism-graph-data-management.pdf">slides</a>, <a href="https://youtu.be/a1VYjyec8dg">video</a></td> +</tr> +<tr> +<td>19:07</td> +<td>Vasia Kalavri (Boston University)</td> +<td>Learning to partition unbounded graph streams – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasia-kalavri-learning-to-partition-unbounded-graph-streams.pdf">slides</a>, <a 
href="https://youtu.be/PTlUABKWniA">video</a></td> +</tr> +<tr> +<td>19:19</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Towards a Hybrid OLTP-OLAP Graph Benchmark – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/muhammad-attahir-jibril-hybrid-oltp-olap-benchmark.pdf">slides</a>, <a href="https://youtu.be/tMBVszTSJXc">video</a></td> +</tr> +<tr> +<td>19:31</td> +<td>Riccardo Tommasini (University of Tartu)</td> +<td>An outlook on Benchmarks for Graph Stream Processing – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/riccardo-tommasini-graph-stream-processing-benchmarks.pdf">slides</a>, <a href="https://youtu.be/HabvJvPXsLc">video</a></td> +</tr> +<tr> +<td>19:43</td> +<td>Mohamed Ragab (University of Tartu)</td> +<td>Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mohamed-ragab-benchranking.pdf">slides</a>, <a href="https://youtu.be/mZ8LhGUq7Wg">video</a></td> +</tr> +</tbody> +</table> + + + + + Thirteenth TUC Meeting + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + Tue, 30 Jun 2020 14:00:00 +0000 + + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + <p>LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Gabor Szarnyas (BME) to register.</p> +<h3 id="snb-task-force">SNB Task Force</h3> +<ul> +<li>Progress report +<ul> +<li>ACID compliance test suite</li> +<li>Integrating deletions to Datagen</li> +<li>Migrating Datagen to Spark</li> +<li>Redesign of BI read queries</li> +<li>Extensions to the driver</li> +</ul> +</li> +<li>Ongoing work +<ul> +<li>Datagen: tuning the distribution of deletes</li> +<li>Interactive 2.0 workload</li> +<li>BI 1.0 workload</li> +</ul> +</li> +</ul> +<p>Zoom links will be sent through email.</p> + + + + + Speeding Up LDBC SNB Datagen + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + Fri, 12 Jun 2020 00:00:00 +0000 + + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + <p>LDBC&rsquo;s <a href="#references">Social Network Benchmark [4]</a> (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems&rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.</p> +<p>LDBC SNB provides <a href="https://github.com/ldbc/ldbc_snb_datagen">Datagen</a> (Data Generator), which produces synthetic datasets, mimicking a social network&rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms, as well as public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).</p> +<h2 id="overview">Overview</h2> +<p>The benchmark&rsquo;s specification describes a social network <a href="https://github.com/ldbc/ldbc_snb_docs/blob/9253abbde94ec7eaccd366c5d4c15cca30752e36/figures/schema-comfortable.pdf">data model</a> which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment or like each other&rsquo;s posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations, which are fixed values. For the detailed specifications of the benchmark and the Datagen component, see <a href="#references">References</a>.</p> +<p>Datasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).</p> +<p><img src="datagen_flow.png" alt=""> \ <em>Figure 1. LDBC SNB Datagen Process on Hadoop</em></p> +<p>In the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.</p> +<p><em>Note: The diagram shows the call sequence as implemented. 
All steps are sequential &ndash; including the relationship generation &ndash;, even in cases when the data dependencies would allow for parallelization.</em></p> +<p>Entities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. Most entities follow a shallow representation, i.e. foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup> A notable exception is the Knows edge which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this <em>edge as property</em> representation makes the data harder to handle in SQL than it would be with a flat join table.</p> +<p>Entity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup></p> +<p>Serialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. 
Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.</p> +<h2 id="motivations-for-the-migration">Motivations for the migration</h2> +<p>The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:</p> +<ul> +<li> +<p><strong>Better memory utilization:</strong> MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.</p> +</li> +<li> +<p><strong>Smaller codebase:</strong> The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.</p> +</li> +<li> +<p><strong>Small entry cost:</strong> Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.</p> +</li> +<li> +<p><strong>Incremental improvements:</strong> Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. 
Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.</p> +</li> +<li> +<p><strong>OSS, commodity:</strong> Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.</p> +</li> +</ul> +<h2 id="first-steps">First steps</h2> +<p>The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. The following bullet-points summarize the key notions that cropped up during the process.</p> +<ul> +<li> +<p><strong>Use your memory:</strong> A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using <code>MEMORY_AND_DISK</code>). In short, the default caching strategy was used everywhere.</p> +</li> +<li> +<p><strong>Regression tests:</strong> Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. 
The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.</p> +</li> +<li> +<p><strong>Thread-safety concerns:</strong> Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn&rsquo;t a concern in the original implementation due to the fact that MapReduce doesn&rsquo;t use thread-based parallelization for mappers and reducers.<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, <sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup> which makes it somewhat harder to find potential unguarded shared variables.</p> +</li> +</ul> +<h2 id="case-study-person-ranking">Case study: Person ranking</h2> +<p>Migrating was rather straightforward, however, the so-called person ranking step required some thought. 
The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the <a href="#references">S3G2 paper [3]</a>.</p> +<h3 id="the-original-mapreduce-version">The original MapReduce version</h3> +<p><img src="person_ranking.svg" alt=""> \ <em>Figure 2. Diagram of the MapReduce code for ranking persons</em></p> +<p>The implementation, shown in pseudocode above, works as follows:</p> +<ol> +<li>The equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low.</li> +<li>The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate &ldquo;side-channel&rdquo; file upon the completion of a reduce task.</li> +<li>In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person.</li> +</ol> +<p>Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.</p> +<h3 id="the-migrated-version">The migrated version</h3> +<p>Spark provides a sortBy function which takes care of the first step above in a single line. 
The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.</p> +<h2 id="benchmarks">Benchmarks</h2> +<p>Benchmarks were carried out on AWS <a href="https://aws.amazon.com/emr/">EMR</a>, originally utilising <a href="https://aws.amazon.com/ec2/instance-types/i3/">i3.xlarge</a> instances because of their fast NVMe SSD storage and ample amount of RAM.</p> +<p>The application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yielded slowdowns for higher values. The Spark version on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.<sup id="fnref:5"><a href="#fn:5" class="footnote-ref" role="doc-noteref">5</a></sup> The MapReduce results were as follows:</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>16</td> +<td>1.60</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>34</td> +<td>1.13</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>40</td> +<td>1.20</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>44</td> +<td>1.32</td> +</tr> +</tbody> +</table> +<p>It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.</p> +<p><img src="mr_sf100_cpu_load.png" alt=""> <br> +<em>Figure 3. CPU Load for the Map Reduce cluster is bursty and less than<br> +50% on average (SF100, 2nd graph shows master)</em></p> +<p><img src="mr_sf100_mem_free.png" alt=""> <br> +<em>Figure 4. 
The job only starts to consume memory when already 10 minutes<br> +into the run (SF100, 2nd graph shows master)</em></p> +<p>Let&rsquo;s see how Spark fares.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>10</td> +<td>1.00</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>21</td> +<td>0.70</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>27</td> +<td>0.81</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>36</td> +<td>1.08</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +</tbody> +</table> +<p>A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn&rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. In fact, the OOM errors encountered when running SF3000 support this hypothesis even further. It was thus proposed to scale up the RAM in the instances. 
The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.</p> +<p><img src="spark_sf100_cpu_load.png" alt=""> <br> +<em>Figure 5. Full CPU utilization for Spark (SF100, last graph shows<br> +master)</em></p> +<p><img src="spark_sf100_mem_free.png" alt=""> <br> +<em>Figure 6. Spark eats up memory fast (SF100, 2nd graph shows master)</em></p> +<p>i3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it <em>only</em> has a 300 GB SSD.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>16</td> +<td>0.48</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>21</td> +<td>0.63</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>26</td> +<td>0.78</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +<tr> +<td>10000</td> +<td>303</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<p>The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.</p> +<h2 id="next-steps">Next steps</h2> +<p>The next improvement is refactoring the serializers so they use Spark&rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. 
EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.</p> +<p>As already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.</p> +<p>The Spark migration also serves as an important building block for the next generation of LDBC benchmarks. As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for <a href="#references">generating delete operations [1]</a>. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. 
Thanks for your help and feedback!</p> +<h2 id="references">References</h2> +<p>[1] <a href="https://ldbcouncil.org/docs/papers/datagen-deletes-grades-nda-2020.pdf">Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark&rsquo;s Data Generator</a></p> +<p>[2] <a href="https://www.youtube.com/watch?v=ZQOLuCOOpSI">9th TUC Meeting &ndash; LDBC SNB Datagen Update &ndash; Arnau Prat (UPC)</a> - <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">slides</a></p> +<p>[3] <a href="https://research.vu.nl/en/publications/s3g2-a-scalable-structure-correlated-social-graph-generator">S3G2: a Scalable Structure-correlated Social Graph Generator</a></p> +<p>[4] <a href="https://arxiv.org/abs/2001.02299">The LDBC Social Network Benchmark</a></p> +<p>[5] <a href="https://ldbcouncil.org/">LDBC</a> - <a href="https://github.com/ldbc">LDBC GitHub organization</a></p> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>Also makes it easier to map to a tabular format thus it is a SQL friendly representation.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>It&rsquo;s hard to imagine this done declaratively in SQL.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>Instead, multiple YARN containers have to be used if you want to parallelize on the same machine.&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>Although editors usually render these using different font styles.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:5"> +<p>With the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). 
During serialization, we check for such entities and omit them. However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.&#160;<a href="#fnref:5" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Twelfth TUC Meeting + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + Fri, 05 Jul 2019 08:30:00 +0100 + + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + <p>LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event on the last Friday of <strong><a href="https://sigmod2019.org/">SIGMOD/PODS 2019</a></strong> in Amsterdam, The Netherlands, in the conference venue of <strong><a href="http://sigmod2019.org/conf_venue">Beurs van Berlage</a></strong>. The room is the Mendes da Silva kamer. Please check its tips for <strong><a href="http://sigmod2019.org/accommodation">accommodation in Amsterdam</a></strong>.</p> +<p>Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2019">GRADES-NDA 2019</a>, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).</p> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.</p> +<p><strong>Talk proposals can be sent to Peter Boncz</strong>, who is also the local organizer. <strong>Please also send your slides to this email for archiving on this site.</strong></p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting, there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).</p> +<p>The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (<strong>room: Mendes da Silva kamer</strong>):</p> +<p>08:30-10:30 LDBC Board Meeting (non-public)</p> +<p>10:30-11:00 Coffee</p> +<p>11:00-12:45 Session 1: Graph Benchmarks</p> +<ul> +<li> +<p>11:00-11:05 Welcome &amp; introduction</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230404.pdf">11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706117.pdf">11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706118.pdf">12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706130.pdf">12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics</a></p> +</li> +</ul> +<p>12:45-14:00 Lunch</p> +<p>14:00-16:05 Session 2: Graph Query Languages</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706120.pdf">14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706121.pdf">14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706122.pdf">report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706119.pdf">14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706129.pdf">15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in GSQL, TigerGraph&rsquo;s query language</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230401.pdf">15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language</a></p> +</li> +</ul> +<p>16:05-16:30 Coffee</p> +<p>16:30-17:50 Session 3: Graph System Performance</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111968258.pdf">16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706124.pdf">16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity</a> <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706116.pptx">pptx</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706128.pdf">17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706133.pdf">17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data</a></p> +</li> +</ul> +<p>If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.</p> + + + + + Eleventh TUC Meeting + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + Fri, 08 Jun 2018 08:30:00 -0500 + + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + <p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event preceding the <a href="https://sigmod2018.org/">SIGMOD/PODS 2018</a> conference in Houston, Texas (not too far away, the whole next week). 
Note also that at SIGMOD/PODS in Houston on Sunday 10, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2018/">GRADES-NDA 2018</a> as well, so you might combine travel.</p> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (<a href="mailto:boncz@cwi.nl">boncz@cwi.nl</a>) and Larri (<a href="mailto:larri@ac.upc.edu">larri@ac.upc.edu</a>). Local organizer is Juan Sequeda (<a href="mailto:juanfederico@gmail.com">juanfederico@gmail.com</a>).</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00:</p> +<ul> +<li> +<p>10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090478.pdf">10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo</a></p> +</li> +<li> +<p>11:00-11:30 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090466.pdf">11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090463.pdf">11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090472.pdf">12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI</a></p> +</li> +<li> +<p>12:45-14:00 lunch</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090474.pdf">14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090481.pdf">14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090485.pdf">14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service</a></p> +</li> +<li> +<p>15:15-15:40 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99811329.pdf">15:40-16:05 Bryon Jacob (data.world): Broadening the Semantic Web</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99287041.pdf">16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99745793.pdf">16:30-16:55 Arthur Keen (Cambridge Semantics): AnzoGraph</a></p> +</li> +<li> +<p><a href="http://relational.ai/">16:55-17:20 Molham Aref (relational.ai)</a>) - Introducing.. 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99418113.pdf">relational.ai</a></p> +</li> +<li> +<p>18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701</p> +</li> +</ul> +<h3 id="location">Location</h3> +<p>The TUC will be held at the <a href="https://www.cs.utexas.edu/">University of Texas at Austin, Department of Computer Science</a> in the <a href="https://www.google.com/maps/place/The+University+of+Texas:+Department+of+Computer+Science/@30.2860955,-97.737582,18z/data=!4m5!3m4!1s0x0:0x12edecc8226b3241!8m2!3d30.2862279!4d-97.7365348">Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712</a> Room: GDC 6.302</p> +<p>The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take the elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.</p> +<h3 id="from-austin-to-sigmodpods-houston-on-saturday-june-9">From Austin to SIGMOD/PODS (Houston) on Saturday June 9</h3> +<p>Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.</p> +<h4 id="bus">Bus</h4> +<p>One option is to take a <a href="https://us.megabus.com/journey-planner/journeys?days=1&amp;concessionCount=0&amp;departureDate=2018-06-09&amp;destinationId=318&amp;inboundOtherDisabilityCount=0&amp;inboundPcaCount=0&amp;inboundWheelchairSeated=0&amp;nusCount=0&amp;originId=320&amp;otherDisabilityCount=0&amp;pcaCount=0&amp;totalPassengers=1&amp;wheelchairSeated=0">MegaBus that departs from downtown Austin and arrives at downtown Houston</a>.</p> +<p>There is a bus that departs at 12:00PM and arrives at 3:00pm. Cost is $20 (as of April 23).</p> +<p>If you want to spend the day in Austin, there is a bus that departs at 9:55PM and arrives at 12:50am. 
Cost is $5 (as of April 23).</p> + + + + + Tenth TUC Meeting + https://ldbcouncil.org/event/tenth-tuc-meeting/ + Fri, 01 Sep 2017 10:30:00 +0100 + + https://ldbcouncil.org/event/tenth-tuc-meeting/ + <p>This will be a one-day event at the <a href="http://www.vldb.org/2017">VLDB 2017</a> conference in Munich, Germany on September 1, 2017.</p> +<p>Topics and activities of interest in these TUC meetings are:</p> +<ul> +<li>Presentation on graph data management usage scenarios.</li> +<li>Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Adrian Diaz (UPC) at <a href="mailto:adiaz@ac.upc.edu">adiaz@ac.upc.edu</a> to register; registration is free, but required.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00</p> +<p>10:30-12:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87588865.pdf">Peter Boncz (CWI): GraphQL task force update - the G-CORE proposal</a> (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868018.pptx">pptx</a>)</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868008.pdf">Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload</a></li> +<li>Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868014.pdf">LDBC Graphalytics v0.9</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868013.pdf">Graphalytics Global Competition and Graphalytics Custom Benchmark</a></li> +</ul> +<p>12:00-13:30: lunch break</p> +<p>13:30-15:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868024.pdf">Arnau Prat (UPC): Datasynth: Democratizing property graph 
generation</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868026.pdf">Marcus Paradies (SAP): SAP HANA GraphScript</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031809.pdf">Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform</a></li> +<li>Gaétan Hains (Huawei): Cost semantics for graph queries</li> +</ul> +<p>15:00-15:30: break</p> +<p>15:30-17:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031812.pdf">Petra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87195650.pdf">Markus Kaindl (Springer): SN SciGraph &ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain</a></li> +<li>Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks</li> +<li>Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset</li> +</ul> +<p>Speakers should aim for a <strong>20-minute talk</strong>.</p> +<p>Further:</p> +<ul> +<li>on Friday evening (19:00-21:00) there will be a <strong>social dinner</strong> at <a href="https://www.loewenbraeukeller.com/en/pub-and-beer-garden/">Löwenbräukeller</a>, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).</li> +<li>on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.</li> +</ul> +<h3 id="venue">Venue</h3> +<p>The Technical University of Munich (TUM) is hosting that week the <a href="http://www.vldb.org/2017">VLDB conference</a>; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops 
ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.</p> +<p>The TUC meeting will be held in <strong>Room 2607</strong> alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).</p> +<p><strong>address: Technische Universität München (TUM), Arcisstraße 21, 80333 München</strong></p> +<p><a href="https://www.google.nl/maps/place/Technische+Universit%C3%A4t+M%C3%BCnchen/@48.14966,11.5656715,17z/data=!3m1!4b1!4m5!3m4!1s0x479e7261336d8c11:0x79a04d44dc5bf19d!8m2!3d48.14966!4d11.5678602?hl=en">Google Maps</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920002.jpg" alt=""><br> +<img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920003.jpg" alt=""></p> + + + + + Ninth TUC Meeting + https://ldbcouncil.org/event/ninth-tuc-meeting/ + Thu, 09 Feb 2017 15:07:18 -0400 + + https://ldbcouncil.org/event/ninth-tuc-meeting/ + <p>LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> in Walldorf, Germany on February 9+10, 2017.</p> +<p>This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts, but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.</p> +<p>Thursday evening (19:00-21:00) there will be a <strong>social dinner</strong> in Heidelberg.</p> +<p>Friday morning the event resumes from 9:00-12:00. 
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.</p> +<h4 id="social-dinner">Social Dinner</h4> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235334.png" alt=""></p> +<p><strong>Address: Hauptstraße 217, 69117 Heidelberg</strong><br> +<strong>Time: 19:00 / 7pm</strong></p> +<p>(See attachments at the bottom of the page)</p> +<h5 id="thursday">Thursday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>Welcome and logistics - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>9:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235329.pdf">Intro + state of the LDBC - Josep Lluis Larriba Pey</a> (UPC)</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235338.pdf">LDBC Graph QL task force</a> - Hannes Voigt (TU Dresden)</td> +</tr> +<tr> +<td>9:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235335.pdf">PGQL Status Update and Comparison to LDBC&rsquo;s Graph QL proposals</a> - Oskar van Rest (Oracle Labs)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628546.pdf">Adding shortest-paths to MonetDB</a> - Dean de Leo (CWI)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431939.pdf">Evolving Cypher for processing multiple graphs</a> - Stefan Plantikow (Neo Technology)</td> +</tr> +<tr> +<td>11:10</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235346.pdf">Standardizing Graph Database Functionality - An Invitation to Collaborate</a> - Jan Michels (ISO/ANSI SQL, Oracle)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235343.pdf">Dgraph: Graph database for production environment</a> - Tomasz Zdybal (Dgraph.io)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431945.pdf">LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap</a> - Alexandru Iosup (TU Delft)</td> +</tr> +<tr> +<td>13:20</td> +<td>LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)</td> +</tr> +<tr> +<td>13:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">LDBC SNB Datagen Update</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431943.pdf">LDBC SNB Business Intelligence Workload: Chokepoint Analysis</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431947.pdf">LDBC Benchmark Cost Specification</a> (+discussion) - Moritz Kaufmann (TU Munich)</td> +</tr> +<tr> +<td>14:40</td> +<td>coffee break</td> +</tr> +<tr> +<td>15:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76316673.pdf">EYWA: the Distributed Graph Engine in Huawei MIND Platform</a> (Yinglong Xia)</td> +</tr> +<tr> +<td>15:30</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431949.pdf">Graph Processing in SAP HANA</a> - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>15:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628563.pdf">Distributed Graph Analytics with Gradoop</a> - Martin Junghanns (Univ Leipzig)</td> +</tr> +<tr> +<td>16:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152834.pdf">Distributed graph flows: Cypher on Flink and Gradoop</a> - Max Kießling (Neo Technology)</td> +</tr> +<tr> +<td>16:30</td> +<td>closing - Peter Boncz</td> +</tr> +<tr> +<td>17:30</td> +<td>end</td> +</tr> +</tbody> +</table> +<h5 id="friday">Friday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>welcome - Peter Boncz</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152833.pdf">Graph processing in obi4wan</a> - Frank Smit (OBI4WAN)</td> +</tr> +<tr> +<td>9:40</td> +<td>Graph problems in the space domain - Albrecht Schmidt (ESA)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75792387.pdf">Medical Ontologies for Healthcare</a> - Michael Neumann (SAP)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76447745.pdf">The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries</a> - Gabor Szarnyas (BME)</td> +</tr> +<tr> +<td>11:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76021761.pdf">Efficient sparse 
matrix computations and their generalization to graph computing applications</a> - Albert-Jan Yzelman (Huawei)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152837.pdf">Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge</a> - Atanas Kyriakov (Ontotext)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td>LDBC Board of Directors Meeting</td> +</tr> +<tr> +<td>17:00</td> +<td>end</td> +</tr> +</tbody> +</table> +<h3 id="logistics">Logistics</h3> +<h5 id="important-things-to-know"><strong>Important things to know</strong></h5> +<p>The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">link</a></p> +<h5 id="venue"><strong>Venue</strong></h5> +<p>The TUC meeting will be held in the <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> at the SAP Guesthouse Kalipeh (<a href="https://www.kalipeh.com">https://www.kalipeh.com</a>). The address is:</p> +<p><strong>WDF 44 / SAP Guesthouse Kalipeh<br> +Dietmar-Hopp-Allee 15<br> +69190 Walldorf<br> +Germany</strong></p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p><a href="https://www.google.com/maps/place/SAP+Guesthouse+Kalipeh/@49.2951903,8.6436224,17z/data=!3m1!4b1!4m5!3m4!1s0x4797bea343a566af:0xd70698f3503ab74b!8m2!3d49.2951868!4d8.6458111">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/69042180.png" alt=""></p> +<h4 id="getting-there"><strong>Getting there</strong></h4> +<h5 id="by-plane"><strong>By plane</strong></h5> +<p>There are two airports close to SAP&rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). 
The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.</p> +<p>When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt-Hahn is approximately one hour from the Frankfurt main airport by car.</p> +<p>The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).</p> +<p>The journey from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).</p> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<p><strong>Traveling from Frankfurt Airport (FRA) to SAP Headquarters:</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.&rdquo;</li> +<li>Follow the A5 to &ldquo;Basel/Karlsruhe/Heidelberg.&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<p>(Should you use a navigational system which does not recognize the street name &lsquo;Dietmar-Hopp-Allee&rsquo; please use &lsquo;Neurottstrasse&rsquo; instead.)</p> +<p><strong>Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:</strong></p> +<p>To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. 
The route via Karlsruhe is a bit shorter yet may be more congested.</p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A8/Stuttgart/B27.&rdquo;</li> +<li>Stay on A8 and follow the sign for &ldquo;Karlsruhe/Heilbronn/Singen/A8.&rdquo;</li> +<li>Follow A8 to Karlsruhe.</li> +<li>Take exit 41 &ndash; &ldquo;Dreieck Karlsruhe&rdquo; to merge onto A5 toward &ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<h6 id="parking"><strong>Parking</strong></h6> +<p>The closest parking lot to the event location is P7 (see figure above).</p> +<h5 id="by-train"><strong>By Train</strong></h5> +<p>As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.</p> +<p><strong>From Frankfurt Airport (FRA) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to Terminal 1, level T (see overview in Appendix).</li> +<li>Go to the AIRail Terminal &ndash; &ldquo;Fernbahnhof&rdquo; (long-distance trains).</li> +<li>Choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP.&rdquo; It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> +<p><strong>From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to the S-Bahn station in the airport, following the sign (station is called &ldquo;Stuttgart Flughafen/Messe&rdquo;).</li> +<li>Take 
train number S2 or S3 to &ldquo;Stuttgart Hauptbahnhof&rdquo; (main station).</li> +<li>From Stuttgart Hauptbahnhof choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP&rdquo;. It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> + + + + + LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + Tue, 06 Sep 2016 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + <p>LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.</p> +<p>LDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.</p> +<p>Tim Hegeman of <a href="https://www.tudelft.nl">TU Delft</a> is today presenting the technical paper describing LDBC Graphalytics at the important <a href="https://www.vldb.org/conference.html">VLDB</a> (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.</p> +<p>LDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.</p> +<p>Learn more: <a href="/ldbc-graphalytics">LDBC Graphalytics</a></p> +<p>GitHub: <a href="https://github.com/tudelft-atlarge/graphalytics">https://github.com/tudelft-atlarge/graphalytics</a></p> + + + + + Eighth TUC Meeting + https://ldbcouncil.org/event/eighth-tuc-meeting/ + Wed, 22 Jun 2016 14:45:20 -0400 + + https://ldbcouncil.org/event/eighth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a> in Redwood Shores facility on <strong>Wednesday and Thursday June 22-23, 2016</strong>.</p> +<p>This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify Oracle security in advance, registration requests need to be in by <strong>June 12</strong>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. 
We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +<li><a href="#accommodation">Accommodation</a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1 pm.</p> +<h6 id="wednesday-22th-of-june-2016-room-203"><strong>Wednesday, 22nd of June 2016 (Room 203)</strong></h6> +<p>(full morning: LDBC Board of Directors meeting)</p> +<ul> +<li>12:00 - 13:00 Lunch (provided)</li> +<li>13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome.</li> +<li>13:30 - 14:00 Peter Boncz (CWI) <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133891.pdf">LDBC introduction and status update</a>.</li> +<li>14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. 
Larriba-Pey)</li> +<li>14:00 Arnau Prat (DAMA-UPC). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133902.pdf">Social Network Benchmark, Interactive workload</a>.</li> +<li>14:30 Tim Hegeman (TU Delft). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133893.pdf">Social Network Benchmark, Analytics workload</a>.</li> +<li>15:00 - 15:30 Coffee break</li> +<li>15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133897.pdf">Graphing Healthcare Networks: Data, Analytics, and Use Cases.</a></li> +<li>16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133901.pdf">Frappé: Querying and managing evolving code dependency graphs</a>.</li> +<li>16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133895.pdf">UniProt: challenges of a public SPARQL endpoint.</a></li> +</ul> +</li> +<li>17:00 - 18:30 Graph Technologies (chair Peter Boncz) +<ul> +<li>17:00 Eugene I. Chong (Oracle USA). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133904.pdf">Balancing Act to improve RDF Query Performance in Oracle Database</a>.</li> +<li>17:30 Lijun Chang (University of New South Wales). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133906.pdf">Efficient Subgraph Matching by Postponing Cartesian Products</a>.</li> +<li>18:00 Weining Qian (East China Normal University). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133908.pdf">On Statistical Characteristics of Real-Life Knowledge Graphs</a>.</li> +</ul> +</li> +</ul> +<h6 id="thursday-23th-of-june-2016-room-203"><strong>Thursday, 23rd of June 2016 (Room 203)</strong></h6> +<ul> +<li>08:00 - 09:00 Breakfast (provided)</li> +<li>09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) +<ul> +<li>09:00 Peter Boncz (CWI). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133896.pdf">Query Language Task Force status</a></li> +<li>09:45 Marcus Paradies (SAP). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297729.pdf">Social Network Benchmark, Business Intelligence workload</a></li> +</ul> +</li> +<li>10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) +<ul> +<li>10:00 Sergey Edunov (Facebook). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297731.pdf">Generating realistic trillion-edge graphs</a></li> +<li>10:30 George Fletcher (TU Eindhoven). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297733.pdf">An open source framework for schema-driven graph instance and graph query workload generation</a>.</li> +<li>11:00 Yinglong Xia (Huawei Research America): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297735.pdf">An Efficient Big Graph Analytics Platform</a>.</li> +<li>11:30 Zhe Wu (Oracle USA). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297737.pdf">Bridging RDF Graph and Property Graph Data Models</a></li> +</ul> +</li> +<li>12:00 - 13:30 Lunch (provided)</li> +<li>13:30 - 15:30 Graph Technologies (chair Arnau Prat) +<ul> +<li>13:30 Tobias Lindaaker (Neo Technology). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297740.pdf">An open standard for graph queries: the Cypher contribution</a></li> +<li>14:00 Arash Termehchy (Oregon State University). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297742.pdf">Toward Representation Independent Graph Querying &amp; Analytics</a></li> +<li>14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297745.pdf">In the service of the federation</a></li> +<li>15:00 Nandish Jayaram (Pivotal). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297747.pdf">Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs</a>.</li> +</ul> +</li> +<li>15:30 - 16:00 Coffee break</li> +<li>16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>16:00 Jans Aasman (Franz Inc.). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428806.pdf">Semantic Data Lake for Healthcare</a></li> +<li>16:15 Kevin Madden (Tom Sawyer Software). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428812.pdf">Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis</a></li> +<li>16:45 Juan Sequeda (Capsenta). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428810.pdf">Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources</a></li> +<li>17:15 Kevin Wilkinson (Hewlett Packard Labs). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428808.pdf">LDBC SNB extensions</a></li> +</ul> +</li> +<li>17:45 - 18:15 Closing discussion</li> +</ul> +<h6 id="friday-24th-of-june-2016-room-105"><strong>Friday, 24th of June 2016 (Room 105)</strong></h6> +<p>At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (<strong>GRADES16</strong>).</p> +<p>18:30 social dinner for GRADES registrants (place to be announced)</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>22nd and 23rd June 2016</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a></p> +<p>The address is:</p> +<p><strong>Room 203 (Wed-Thu) &amp; Room 105 (Fri)</strong><br> +<strong>Oracle Conference Center</strong><br> +<strong>350 Oracle Parkway</strong><br> +<strong>Redwood City, CA 94065, USA</strong></p> +<p><strong>Maps and situation</strong></p> +<p><a href="https://www.google.com/maps/place/Oracle+Conference+Center/@37.5322827,-122.2667034,17z/data=!3m1!4b1!4m2!3m1!1s0x808f98b5450e8ca3:0xdc75e8b1c02bbb91">Google Maps link</a></p> +<p>Oracle Campus map:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/40927234.jpg" alt=""></p> +<h5 id="getting-there"><strong>Getting there</strong></h5> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<ul> +<li>[Southbound] <strong>-</strong> Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine 
World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +<li>[Northbound] <strong>-</strong> Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +</ul> +<h5 id="parking"><strong>Parking</strong></h5> +<p>The Conference Center has a designated parking lot located directly across from the building. If the lot is full, there is also additional parking in any of the parking garages located nearby. No parking permits are needed.</p> +<h5 id="public-transport"><strong>Public transport</strong></h5> +<p>Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.</p> +<ul> +<li>Caltrain timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/weekdaytimetable.html</a></li> +<li>Oracle Shuttle timetables: <a href="http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html">http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html</a></li> +</ul> +<p>You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.</p> +<p>Alternatively, SamTrans (San Mateo County&rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.</p> + + + + + LDBC and Apache Flink + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + Mon, 16 Nov 2015 14:47:00 +0000 + + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + <p>Apache Flink <a href="#references">[1]</a> is an open source platform for 
distributed stream and batch data processing. Flink&rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.</p> +<p><img src="https://flink.apache.org/img/flink-stack-small.png" alt=""></p> +<p>Flink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.</p> +<p>The following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">&gt;</span> text <span style="color:#f92672">=</span> env<span style="color:#f92672">.</span><span style="color:#a6e22e">fromElements</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the past controls the future.&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the present controls the past.&#34;</span><span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> 
+</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>Tuple2<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">,</span> Integer<span style="color:#f92672">&gt;&gt;</span> wordCounts <span style="color:#f92672">=</span> text +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">flatMap</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> LineSplitter<span style="color:#f92672">())</span> <span style="color:#75715e">// splits the line and outputs (word,1) +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>tuples<span style="color:#f92672">.</span><span style="color:#a6e22e">groupBy</span><span style="color:#f92672">(</span><span style="color:#ae81ff">0</span><span style="color:#f92672">)</span> <span style="color:#75715e">// group by word +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">.</span><span style="color:#a6e22e">sum</span><span style="color:#f92672">(</span><span style="color:#ae81ff">1</span><span style="color:#f92672">);</span> <span style="color:#75715e">// sum the 1&#39;s +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>wordCounts<span style="color:#f92672">.</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop <a href="#references">[2]</a>. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. 
To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing <a href="#references">[3]</a>. Using the class <code>LDBCToFlink</code>, LDBC output files can be read directly from HDFS or from the local file system:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;hdfs:///ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> <span style="color:#75715e">// or &#34;/path/to/social_network&#34; +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> vertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span 
style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> edges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The tuple classes <code>LDBCVertex</code> and <code>LDBCEdge</code> hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.</p> +<p>Each <code>LDBCVertex</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all vertices</li> +<li>a vertex label (e.g. <code>Person</code>, <code>Comment</code>)</li> +<li>a key-value map of properties including also multivalued properties<br> +(e.g. <code>Person.email</code>)</li> +</ul> +<p>Each <code>LDBCEdge</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all edges</li> +<li>an edge label (e.g. <code>knows</code>, <code>likes</code>)</li> +<li>a source vertex identifier</li> +<li>a target vertex identifier</li> +<li>a key-value map of properties</li> +</ul> +<p>The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label <code>Person</code> and edges with the label <code>knows</code> and use Gelly to compute the connected components of that subgraph. 
The full source code is available on GitHub <a href="#references">[4]</a>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter vertices with label “Person” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> ldbcVertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span 
style="color:#f92672">.</span><span style="color:#a6e22e">VERTEX_CLASS_PERSON</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter edges with label “knows” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> ldbcEdges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span style="color:#f92672">.</span><span style="color:#a6e22e">EDGE_CLASS_KNOWS</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly vertices suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> vertices <span style="color:#f92672">=</span> ldbcVertices<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly edges suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span 
style="color:#f92672">&lt;</span>Edge<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;&gt;</span> edges <span style="color:#f92672">=</span> ldbcEdges<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>Graph<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;</span> g <span style="color:#f92672">=</span> Graph<span style="color:#f92672">.</span><span style="color:#a6e22e">fromDataSet</span><span style="color:#f92672">(</span>vertices<span style="color:#f92672">,</span> edges<span style="color:#f92672">,</span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// run connected components on the subgraph for 10 iterations +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> components <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> g<span style="color:#f92672">.</span><span style="color:#a6e22e">run</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> ConnectedComponents<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;(</span><span 
style="color:#ae81ff">10</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// print the component id of the first 10 vertices +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>components<span style="color:#f92672">.</span><span style="color:#a6e22e">first</span><span style="color:#f92672">(</span><span style="color:#ae81ff">10</span><span style="color:#f92672">).</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The ldbc-flink-import tool is available on GitHub <a href="#references">[3]</a> and licensed under the GNU GPLv3. If you have any questions regarding the tool please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.</p> +<p>If you want to learn more about Apache Flink, a good starting point is the main documentation <a href="#references">[5]</a>, and if you have any questions, feel free to ask on the official mailing lists.<br> +There is also a nice set of videos <a href="#references">[6]</a> available from the latest Flink Forward conference.</p> +<h4 id="references">References</h4> +<p>[1] <a href="http://flink.apache.org/">http://flink.apache.org/</a></p> +<p>[2] <a href="https://github.com/dbs-leipzig/gradoop">https://github.com/dbs-leipzig/gradoop</a></p> +<p>[3] <a href="https://github.com/s1ck/ldbc-flink-import">https://github.com/s1ck/ldbc-flink-import</a></p> +<p>[4] <a href="https://gist.github.com/s1ck/b33e6a4874c15c35cd16">https://gist.github.com/s1ck/b33e6a4874c15c35cd16</a></p> +<p>[5] <a href="https://ci.apache.org/projects/flink/flink-docs-release-0.10/">https://ci.apache.org/projects/flink/flink-docs-release-0.10/</a></p> +<p>[6] <a 
href="https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA">https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA</a></p> + + + + + Seventh TUC Meeting + https://ldbcouncil.org/event/seventh-tuc-meeting/ + Mon, 09 Nov 2015 14:17:30 -0400 + + https://ldbcouncil.org/event/seventh-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.research.ibm.com/labs/watson">IBM&rsquo;s TJ Watson</a> facility on <strong>Monday and Tuesday November 9/10, 2015.</strong></p> +<p>This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify IBM security in advance, registration requests need to be in by Nov 1.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a><br> +- <a href="#date"><strong>Date</strong></a><br> +- <a href="#venue"><strong>Venue</strong></a><br> +- <a href="#maps-and-situation"><strong>Maps and situation</strong></a><br> +- <a href="#getting-there"><strong>Getting there</strong></a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>Monday, 9th of November 2015</strong></p> +<p>8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)</p> +<p>9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)</p> +<p>9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)</p> +<p>9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload</p> +<p>10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload</p> +<p>10:30-11:00 Coffee break</p> +<p>11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)</p> +<p>11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.</p> +<p>11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.</p> +<p>12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)</p> +<p>14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox</p> +<p>14:30 Peter Kogge (Notre Dame). 
BFS as in Graph500 on today&rsquo;s architectures</p> +<p>15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G</p> +<p>15:30-16:00 Coffee break</p> +<p>16:00 - 17:00 Technologies (chair Irini Fundulaki)</p> +<p>16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store</p> +<p>16:30 David Ediger (GeorgiaTech). STINGER</p> +<p>17:00 Gary King (Franz Inc.). AllegroGraph&rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties</p> +<p>17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics</p> +<p>18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase</p> +<p>19:00 Social dinner</p> +<p><strong>Tuesday 10th November 2015</strong></p> +<p>9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)</p> +<p>9:00 Philip Rathle (Neo). On openCypher</p> +<p>9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification</p> +<p>9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions</p> +<p>10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation</p> +<p>10:30 - 11:00 Coffee break</p> +<p>11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)</p> +<p>11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL</p> +<p>11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,</p> +<p>11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis</p> +<p>12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>9th and 10th November 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the IBM Thomas J Watson Research Center.<br> +The address is:</p> +<p><strong>IBM Thomas J Watson Research Center</strong><br> +<strong>1101 Kitchawan Rd,</strong><br> +<strong>Yorktown Heights, NY 10598, USA</strong></p> +<p>If you are using a <em>GPS system</em>, please enter <strong>&ldquo;200 Aqueduct Road, Ossining NY, 10562&rdquo;</strong> for accurate directions to the lab entrance. You may also want to check the routing online.</p> +<p>The meeting will take place in the <em>Auditorium</em> on November 9th, and in Meeting Room <em>20-043</em> on November 10th.</p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p>You are highly suggested to <strong>rent a car</strong> for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walkable distance to the IBM T.J. Watson Research Center. Feel free to find carpool with other attendees. You may find car rental and hotels through <a href="http://www.orbitz.com">www.orbitz.com</a>, or <a href="http://www.expedia.com">www.expedia.com</a> Feel free to email <a href="mailto:yxia@us.ibm.com">yxia@us.ibm.com</a> for any questions.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/seventh-tuc-meeting/attachments/6882333/15926330.png" alt=""></p> +<h6 id="getting-there"><strong>Getting there</strong></h6> +<p><strong>Upper and Eastern New England</strong></p> +<p>Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. 
IBM is on the left.</p> +<p><strong>New Haven and Connecticut Shores</strong></p> +<p>Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New Jersey</strong></p> +<p>Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>Upstate New York</strong></p> +<p>Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New York City (Manhattan)</strong></p> +<p>Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>John F. Kennedy International Airport</strong></p> +<p>North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>LaGuardia Airport</strong></p> +<p>East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. 
Continue with instructions from John F. Kennedy International Airport, above.</p> +<p><strong>Newark International Airport</strong></p> +<p>North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.</p> +<p><strong>Stewart International Airport</strong></p> +<p>Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.</p> +<p><strong>Westchester County Airport</strong></p> +<p>Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.</p> +<p><strong>Public Transportation</strong></p> +<p>Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.</p> + + + + + Elements of Instance Matching Benchmarks: a Short Overview + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + Tue, 16 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + <p>The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. 
In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using <strong>instance matching</strong> techniques and tools. Instance matching is also known as <strong>record linkage</strong> <a href="#references">[1]</a>, <strong>duplicate detection</strong> <a href="#references">[2]</a>, <strong>entity resolution</strong> <a href="#references">[3]</a> and <strong>object identification</strong> <a href="#references">[4]</a>.</p> +<p>For instance, a search in Geonames (<a href="http://www.geonames.org/">http://www.geonames.org/</a>) for &ldquo;Athens&rdquo; would return a resource (i.e., URI) accompanied by a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as for instance DBpedia (<a href="http://dbpedia.org/">http://dbpedia.org/</a>) or Open Government Datasets (<a href="http://data.gov.gr/">http://data.gov.gr/</a>). To obtain and exploit all necessary information about the city of Athens we need to establish that the retrieved resources refer to the same real world object.</p> +<p>Web resources are published by &ldquo;autonomous agents&rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data.
Clearly, these reasons are not limited to problems that did arise in the era of Web Data, it is thus not surprising that instance matching systems have been around for several years <a href="#references">[2]</a><a href="#references">[5]</a>.</p> +<p>It is though essential at this point to develop, along with instance and entity matching systems, <em>instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs</em>. Hence, well defined, and good quality benchmarks are important for comparing the performance of the available or under development instance matching systems. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.</p> +<p>An instance matching benchmark for Linked Data consists of a <em>source</em> and <em>target dataset</em> implementing a set of <em>test-cases</em>, where each test case addresses a different kind of requirement regarding instance matching, a <em>ground truth</em> or <em>gold standard</em> and finally the <em>evaluation metrics</em> used to <em>assess the benchmark.</em></p> +<p>Datasets are the raw material of a benchmark. A benchmark comprises of a <em>source</em> and <em>target</em> dataset and the objective of an instance matching system is to discover the matches of the two. 
Datasets are characterized by (a) their <em>nature</em> (<em>real</em> or <em>synthetic</em>), (b) the <em>schemas/ontologies</em> they use, (c) their <em>domains</em>, (d) the <em>languages</em> they are written in, and (e) the <em>variations/heterogeneities</em> of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. <em>Synthetic datasets</em> are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner.</p> +<p>Datasets (and benchmarks) may contain different <em>kinds of variations</em> that correspond to <em>different test cases</em>. According to Ferrara et al. <a href="#references">[6]</a><a href="#references">[7]</a>, three kinds of variations exist for Linked Data, namely <em>data variations</em>, <em>structural variations</em> and <em>logical variations</em>. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.</p> +<p>The common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations.
On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.</p> +<p>The <em>gold standard</em> is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.</p> +<p>Last, an instance matching benchmark uses <em>evaluation metrics</em> to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard <em>precision</em>, <em>recall</em> and <em>f-measure</em> metrics.</p> +<h4 id="references">References</h4> +<p>[1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. 
WWW 2006.</p> +<p>[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamaza, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).</p> +<p>[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.</p> +<p>[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.</p> +<p>[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.</p> +<p>[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3rd ISWC workshop on ontology matching (OM 2008).</p> +<p>[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web.
In ESWC, 2011.</p> + + + + + SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + Wed, 10 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + <p>In this post we will look at running the <a href="https://ldbcouncil.org/developer/snb">LDBC SNB</a> on <a href="https://virtuoso.openlinksw.com/">Virtuoso</a>.</p> +<p>First, let&rsquo;s recap what the benchmark is about:</p> +<ol> +<li> +<p>fairly frequent short updates, with no update contention worth mentioning</p> +</li> +<li> +<p>short random lookups</p> +</li> +<li> +<p>medium complex queries centered around a person&rsquo;s social environment</p> +</li> +</ol> +<p>The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.</p> +<p>The DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, <em>per se,</em> since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.</p> +<p>The workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.</p> +<p>The test system is the same as used in the <a href="http://www.openlinksw.com/weblog/oerling/?id=1739">TPC-H series</a>: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics branch</a> of <a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack, available from www.github.com</a>.</p> +<p>The dataset is the SNB 300G set, with:</p> +<table> +<thead> +<tr> +<th>1,136,127</th> +<th>persons</th> +</tr> +</thead> +<tbody> +<tr> +<td>125,249,604</td> +<td>knows edges</td> +</tr> +<tr> +<td>847,886,644</td> +<td>posts, including replies</td> +</tr> +<tr> +<td>1,145,893,841</td> +<td>tags of posts or replies</td> +</tr> +<tr> +<td>1,140,226,235</td> +<td>likes of posts or replies</td> +</tr> +</tbody> +</table> +<p>As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.</p> +<p>Below are the numerical quantities for a 400K operation run after 150K operations worth of warmup.</p> +<p><strong>Duration:</strong> 10:41.251<br> +<strong>Throughput:</strong> 623.71 (op/s)</p> +<p>The statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.</p> +<table> +<thead> +<tr> +<th>% of total</th> +<th>total_wait</th> +<th>name</th> +<th>count</th> +<th>mean</th> +<th>min</th> +<th>max</th> +</tr> +</thead> +<tbody> +<tr> +<td>20%</td> +<td>4,231,130</td> +<td>LdbcQuery5</td> +<td>656</td> +<td>6,449.89</td> +<td>245</td> +<td>10,311</td> +</tr> +<tr> +<td>11%</td> +<td>2,272,954</td> +<td>LdbcQuery8</td> +<td>18,354</td> +<td>123.84</td> +<td>14</td> +<td>2,240</td> +</tr> +<tr> +<td>10%</td> +<td>2,200,718</td> +<td>LdbcQuery3</td> +<td>388</td> +<td>5,671.95</td> +<td>468</td> +<td>17,368</td> +</tr> +<tr> +<td>7.3%</td> +<td>1,561,382</td> +<td>LdbcQuery14</td> +<td>1,124</td> +<td>1,389.13</td> +<td>4</td> +<td>5,724</td> +</tr> +<tr> +<td>6.7%</td> +<td>1,441,575</td> +<td>LdbcQuery12</td> +<td>1,252</td> +<td>1,151.42</td> +<td>15</td> +<td>3,273</td> +</tr> +<tr> +<td>6.5%</td> +<td>1,396,932</td> +<td>LdbcQuery10</td> +<td>1,252</td> +<td>1,115.76</td> +<td>13</td> +<td>4,743</td> +</tr> +<tr> +<td>5%</td> +<td>1,064,457</td> +<td>LdbcShortQuery3PersonFriends</td> +<td>46,285</td> +<td>22.9979</td> +<td>0</td> +<td>2,287</td> +</tr> +<tr> +<td>4.9%</td> +<td>1,047,536</td> +<td>LdbcShortQuery2PersonPosts</td> +<td>46,285</td> +<td>22.6323</td> +<td>0</td> +<td>2,156</td> +</tr> +<tr> +<td>4.1%</td> +<td>885,102</td> +<td>LdbcQuery6</td> +<td>1,721</td> +<td>514.295</td> +<td>8</td> +<td>5,227</td> +</tr> +<tr> +<td>3.3%</td> +<td>707,901</td> +<td>LdbcQuery1</td> +<td>2,117</td> +<td>334.389</td> +<td>28</td> +<td>3,467</td> +</tr> +<tr> +<td>2.4%</td> +<td>521,738</td> +<td>LdbcQuery4</td> +<td>1,530</td> +<td>341.005</td> +<td>49</td> +<td>2,774</td> +</tr> +<tr> +<td>2.1%</td> +<td>440,197</td> +<td>LdbcShortQuery4MessageContent</td> +<td>46,302</td> +<td>9.50708</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.9%</td> +<td>407,450</td> +<td>LdbcUpdate5AddForumMembership</td> +<td>14,338</td> +<td>28.4175</td> +<td>0</td> +<td>2,008</td> +</tr> +<tr> 
+<td>1.9%</td> +<td>405,243</td> +<td>LdbcShortQuery7MessageReplies</td> +<td>46,302</td> +<td>8.75217</td> +<td>0</td> +<td>2,112</td> +</tr> +<tr> +<td>1.9%</td> +<td>404,002</td> +<td>LdbcShortQuery6MessageForum</td> +<td>46,302</td> +<td>8.72537</td> +<td>0</td> +<td>1,968</td> +</tr> +<tr> +<td>1.8%</td> +<td>387,044</td> +<td>LdbcUpdate3AddCommentLike</td> +<td>12,659</td> +<td>30.5746</td> +<td>0</td> +<td>2,060</td> +</tr> +<tr> +<td>1.7%</td> +<td>361,290</td> +<td>LdbcShortQuery1PersonProfile</td> +<td>46,285</td> +<td>7.80577</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.6%</td> +<td>334,409</td> +<td>LdbcShortQuery5MessageCreator</td> +<td>46,302</td> +<td>7.22234</td> +<td>0</td> +<td>2,055</td> +</tr> +<tr> +<td>1%</td> +<td>220,740</td> +<td>LdbcQuery2</td> +<td>1,488</td> +<td>148.347</td> +<td>2</td> +<td>2,504</td> +</tr> +<tr> +<td>0.96%</td> +<td>205,910</td> +<td>LdbcQuery7</td> +<td>1,721</td> +<td>119.646</td> +<td>11</td> +<td>2,295</td> +</tr> +<tr> +<td>0.93%</td> +<td>198,971</td> +<td>LdbcUpdate2AddPostLike</td> +<td>5,974</td> +<td>33.3062</td> +<td>0</td> +<td>1,987</td> +</tr> +<tr> +<td>0.88%</td> +<td>189,871</td> +<td>LdbcQuery11</td> +<td>2,294</td> +<td>82.7685</td> +<td>4</td> +<td>2,219</td> +</tr> +<tr> +<td>0.85%</td> +<td>182,964</td> +<td>LdbcQuery13</td> +<td>2,898</td> +<td>63.1346</td> +<td>1</td> +<td>2,201</td> +</tr> +<tr> +<td>0.74%</td> +<td>158,188</td> +<td>LdbcQuery9</td> +<td>78</td> +<td>2,028.05</td> +<td>1,108</td> +<td>4,183</td> +</tr> +<tr> +<td>0.67%</td> +<td>143,457</td> +<td>LdbcUpdate7AddComment</td> +<td>3,986</td> +<td>35.9902</td> +<td>1</td> +<td>1,912</td> +</tr> +<tr> +<td>0.26%</td> +<td>54,947</td> +<td>LdbcUpdate8AddFriendship</td> +<td>571</td> +<td>96.2294</td> +<td>1</td> +<td>988</td> +</tr> +<tr> +<td>0.2%</td> +<td>43,451</td> +<td>LdbcUpdate6AddPost</td> +<td>1,386</td> +<td>31.3499</td> +<td>1</td> +<td>2,060</td> +</tr> +<tr> +<td>0.01%</td> +<td>1,848</td> 
+<td>LdbcUpdate4AddForum</td> +<td>103</td> +<td>17.9417</td> +<td>1</td> +<td>65</td> +</tr> +<tr> +<td>0.00%</td> +<td>44</td> +<td>LdbcUpdate1AddPerson</td> +<td>2</td> +<td>22</td> +<td>10</td> +<td>34</td> +</tr> +</tbody> +</table> +<p>At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.</p> +<p>The implementation is well optimized in general but still has maybe 30% room for improvement. We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.</p> +<p>The set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:</p> +<ul> +<li> +<p><em>Cardinality estimation under heavy data skew —</em> Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.</p> +</li> +<li> +<p><em>Covering indices —</em> Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. 
For example, an index on post by author can also contain the post&rsquo;s creation date.</p> +</li> +<li> +<p><em>Multi-hop graph traversal —</em> Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.</p> +</li> +<li> +<p><em>Top <em>K</em> —</em> Most queries returning posts order results by descending date. Once there are at least <em>k</em> results, anything older than the <em>k</em>th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top <em>k</em>.</p> +</li> +<li> +<p><em>Late projection —</em> Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.</p> +</li> +<li> +<p><em>Materialization —</em> Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.</p> +</li> +<li> +<p><em>Concurrency control —</em> Read-write contention is rare, as updates are randomly spread over the database.
However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.</p> +</li> +</ul> +<p>In subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + SNB and Graphs Related Presentations at GRADES '15 + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + Fri, 29 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + <p>Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. 
GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.</p> +<p>Among the papers published in this edition we have &ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms&rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in <a href="https://github.com/ldbc">https://github.com/ldbc</a>) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have &ldquo;Microblogging Queries on Graph Databases: an Introspection&rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention &ldquo;Frappé: Querying the Linux Kernel Dependency Graph&rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.</p> +<p><a href="http://event.cwi.nl/grades2015/program.shtml">Check the complete agenda.</a></p> +<p>Meet you in Melbourne!</p> + + + + + SNB Interactive Part 2: Modeling Choices + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + Tue, 26 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + <p><a href="https://ldbcouncil.org/benchmarks/snb">​SNB Interactive</a> is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.</p> +<p>In the case of <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server">Virtuoso</a>, we have played with <a href="http://dbpedia.org/resource/SQL">SQL</a> and <a href="http://dbpedia.org/resource/SPARQL">SPARQL</a> implementations. For a fixed schema and well known workload, SQL will always win. 
The reason for this is that this allows to materialize multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.</p> +<h3 id="schema-design">Schema Design</h3> +<p>SNB has a regular schema described by a <a href="https://en.wikipedia.org/wiki/Unified_Modeling_Language">UML</a> diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.</p> +<p>The only table-level choice has to do with whether <code>posts</code> and <code>comments</code> are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. There is a single nullable foreign key from the reply to the post/comment being replied to.</p> +<p>The workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. 
There, having a composite key of <code>ps_creatorid</code>, <code>ps_creationdate</code>, <code>ps_postid</code> pays off since the top-k on <code>creationdate</code> can be pushed down into the index without needing a reference to the table.</p> +<p>The implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the <a href="http://dbpedia.org/resource/DEX_(Graph_database)">Sparksee</a> and <a href="http://dbpedia.org/resource/Neo4j">Neo4J</a> implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.</p> +<p>The benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 &ldquo;top of the wall&rdquo; to a single lookup. This does not, however, appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. <code>person1</code>, <code>person2</code> -&gt; <code>weight</code>. 
<code>Person1</code> is by convention the one with the smaller <code>p_personid</code>. Note that comparing id&rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI&rsquo;s with disastrous performance implications unless an implementation specific trick were used.</p> +<p>In the next installment we will analyze an actual run.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + Mon, 25 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + <p>LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.</p> +<p>On the industry track, LDBC will be presenting the <em>Social Network Benchmark Interactive Workload</em> by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).</p> +<p>You can read more about the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark here</a> and collaborate if you&rsquo;re interested!</p> +<p>The other presentation will be at the GRADES workshop within the SIGMOD program regarding <em>Graphalytics: A Big Data Benchmark for Graph-Processing platforms</em> by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.</p> +<p>Don&rsquo;t forget to check our presentations if you&rsquo;re attending the SIGMOD!</p> + + + + + SNB Interactive Part 1: What Is SNB Interactive Really About? + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + Thu, 14 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + <p>This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. 
This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.</p> +<p>With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.</p> +<p>The essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.</p> +<p>So far, we see that SNB confronts the implementor with choices in the following areas:</p> +<ul> +<li>Data model: Relational, RDF, property graph?</li> +<li>Physical model, e.g. row-wise vs. column-wise storage</li> +<li>Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures</li> +<li>Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers</li> +<li>Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc.</li> +<li>Parameters vs. literals: Sometimes different parameter values result in different optimal query plans</li> +<li>Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload</li> +<li>Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing.</li> +</ul> +<p>In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. 
Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.</p> +<ul> +<li>Data generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary, re-generating every time is not an option.</li> +<li>Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce?</li> +<li>Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a &lsquo;fast&rsquo; and &lsquo;slow&rsquo; case of a single query template. How long does one need to run to balance these fluctuations?</li> +<li>Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput.</li> +<li>Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse.</li> +<li>Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. 
The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable.</li> +<li>Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy?</li> +<li>Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters?</li> +</ul> +<p>The following posts will look at the above in light of actual experience.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + Why Do We Need an LDBC SNB-Specific Workload Driver? + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + Tue, 21 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + <p>In a previous <a href="https://ldbcouncil.org/tags/driver">3-part blog series</a> we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. 
What we didn&rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - &ldquo;graph&rdquo; is the word that best captures the novelty and difficulty of this work.</p> +<p><strong>Workload Execution - Traditional vs Graph</strong></p> +<p>Transactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.</p> +<p>To understand what is meant by &ldquo;traditional relational workloads&rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. Note, &ldquo;dependency&rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator&rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. 
In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order <em>for a given user</em>. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.</p> +<p>A significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each other&rsquo;s walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other users&rsquo; posts etc) initiated by a given user. Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. 
Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.</p> +<p>Such scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend&rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. <em>Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.</em></p> +<p><strong>Because it&rsquo;s a graph</strong></p> +<p>In short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.</p> + + + + + Event Driven Post Generation in Datagen + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + Fri, 10 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + <p>As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.</p> +<p>First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during friendship edges generation process. Then, for each person of the block, three types of forums are created:</p> +<ul> +<li> +<p>The wall of the person</p> +</li> +<li> +<p>The albums of the person</p> +</li> +<li> +<p>The groups where the person is a moderator</p> +</li> +</ul> +<p>We will put our attention to group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.</p> +<p>After assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible of, given a forum, generate a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.</p> +<p>Flashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to &ldquo;Enrique Iglesias&rdquo; tag, whose level is 11 and occurs on 29th of May of 2012 at 09:33:47.</p> +<p>Once the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:</p> +<ul> +<li> +<p>Determine the number of posts to generate</p> +</li> +<li> +<p>Select a random member of the group that will generate the post</p> +</li> +<li> +<p>Determine the event the post will be related to given the aforementioned cumulative distribution</p> +</li> +<li> +<p>Assign the date of the post based on the event date</p> +</li> +</ul> +<p>In order to assign the date to the post, based on the date of the event the post is assigned to, we follow the following probability density, which has been extracted from <a href="#references">[1]</a>. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.</p> +<p><img src="index.png" alt=""></p> +<p>Following the example of &ldquo;Enrique Iglesias&rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.</p> +<p><img src="index2.png" alt=""></p> +<p>In this blog entry we have seen how datagen creates event driven user activity. This allows us to reproduce the heterogenous post creation density found in a real social network, where post creation is driven by real world events.</p> +<h4 id="references">References</h4> +<p>[1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. 
KDD 2009: 497-506</p> + + + + + Sixth TUC Meeting + https://ldbcouncil.org/event/sixth-tuc-meeting/ + Thu, 19 Mar 2015 13:53:33 -0400 + + https://ldbcouncil.org/event/sixth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on <strong>Thursday and Friday March 19/20, 2015.</strong></p> +<p>The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the first benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.</li> +<li>Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<h3 id="agenda">Agenda</h3> +<p><strong>Thursday 19th March</strong></p> +<p>11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)</p> +<p>11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981131.pdf">slides</a></p> +<p>12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)</p> +<p>12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981137.pdf">slides</a></p> +<p>12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain</p> +<p>12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive</p> +<p>13:10 Claudio Martella (VUA): Giraph and Lighthouse</p> +<p>13:30 - 14:30 Lunch break</p> +<p>14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)</p> +<p>14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981132.pdf">slides</a></p> +<p>14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981133.pdf">slides</a></p> +<p>15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981139.pdf">slides</a></p> +<p>15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs</p> +<p>18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.</p> +<p>20:00 Social dinner at <a href="http://www.bastaix.com">Bastaix Restaurant</a>.</p> +<p><strong>Friday 20th March</strong></p> +<p>9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)</p> +<p>9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics</p> +<p>9:50 Alexandru Iosup (TU Delft). 
Graphalytics: A big data benchmark for graph-processing platforms</p> +<p>10:10 John Snelson (MarkLogic): Introduction to MarkLogic</p> +<p>10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload</p> +<p>10:50 Moritz Kaufmann. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/moritz-kaufmann-ldbc-snb-benchmark-auditing-6th-ldbc-tuc.pdf">The auditing experience</a></p> +<p>11:15 - 11:45 Coffee break</p> +<p>11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)</p> +<p>11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox</p> +<p>12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data</p> +<p>12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments</p> +<p>12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981138.pdf">slides</a></p> +<p>13:30 - 14:30 Lunch break</p> +<p>15:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>19th and 20th March 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held at &ldquo;Aula Master&rdquo; at A3 building located inside the &ldquo;Campus Nord UPC&rdquo; in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h5 id="maps-and-situation"><strong>Maps and situation</strong></h5> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<h5 id="finding-upc"><strong>Finding UPC</strong></h5> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<h5 id="finding-the-meeting-room"><strong>Finding the meeting room</strong></h5> +<h5 id="getting-there">Getting there</h5> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to<br> +the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<h5 id="the-locations-of-the-airport-and-the-city-centre"><strong>The locations of the airport and the city centre</strong></h5> + + + + + The LDBC Datagen Community Structure + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + Sun, 15 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + <p>This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.</p> +<p>When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, have typically highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.</p> +<p>The first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 <a href="#references">[1]</a>. Here we summarize the paper and its contributions and findings.</p> +<p>Existing synthetic graph generators such as Rmat <a href="#references">[1]</a> and Mag <a href="#references">[2]</a>, are graph generators designed to produce graphs with long tailed distributions and large clustering coefficient, but completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR <a href="#references">[3]</a>, a graph generator that not only produced graphs with realistic high level characteristics, but enforced an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it not only outputs a graph but also the communities present in that graph, hence it can be used to test the quality of a community detection algorithm.</p> +<p>However, no one studied if the community structure produced by LFR was in fact realistic compared to real graphs. Even though the community structure in LFR exhibits interesting properties, such as the expected larger internal density than external, or a longtailed distribution of community sizes, they lack the noise and inhomogeneities present in a real graph. 
And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? Is it more or less realistic? The authors of <a href="#references">[1]</a> set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to <a href="#references">[4]</a>.</p> +<table> +<thead> +<tr> +<th></th> +<th><em>Nodes</em></th> +<th><em>Edges</em></th> +</tr> +</thead> +<tbody> +<tr> +<td><em>Amazon</em></td> +<td>334863</td> +<td>925872</td> +</tr> +<tr> +<td><em>Dblp</em></td> +<td>317080</td> +<td>1049866</td> +</tr> +<tr> +<td><em>Youtube</em></td> +<td>1134890</td> +<td>2987624</td> +</tr> +<tr> +<td><em>Livejournal</em></td> +<td>3997962</td> +<td>34681189</td> +</tr> +</tbody> +</table> +<p>The authors of <a href="#references">[1]</a> selected a set of statistical indicators to<br> +characterize the communities:</p> +<ul> +<li>The clustering coefficient</li> +<li>The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community.</li> +<li>The bridge ratio, which is the ratio of edges whose removal disconnects the community.</li> +<li>The diameter</li> +<li>The conductance</li> +<li>The size</li> +</ul> +<p>The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. 
We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.</p> +<ul> +<li>Many of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and bridge ratio. This suggests that communities cannot be modeled using a single model.</li> +<li>84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact.</li> +<li>Ground truth communities are not very isolated; they have a lot of connections pointing outside of the community.</li> +<li>Most of the communities are small (10 or fewer nodes).</li> +<li>In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index.png" alt=""></td> +<td style="text-align:center"><img src="index2.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index3.png" alt=""></td> +<td style="text-align:center"><img src="index4.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index5.png" alt=""></td> +<td style="text-align:center"><img src="index6.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> 
+<p>The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index8.png" alt=""></td> +<td style="text-align:center"><img src="index9.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index10.png" alt=""></td> +<td style="text-align:center"><img src="index11.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index11.png" alt=""></td> +<td style="text-align:center"><img src="index12.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>The main conclusions that can be extracted from DATAGEN can be summarized as follows:</p> +<ul> +<li>DATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio.</li> +<li>The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the Youtube and Livejournal graphs.</li> +<li>Communities of DATAGEN graphs are not, as in real graphs, isolated, but in this case their level of isolation is significantly larger.</li> +<li>The diameter is small like in the real graphs.</li> +<li>It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and 
Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics.</li> +</ul> +<p>Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. From them, the authors extract the following conclusions:</p> +<ul> +<li>LFR graphs do not show the multimodal distribution observed in real graphs</li> +<li>Only the diameter shows a similar shape as in the ground truth.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index13.png" alt=""></td> +<td style="text-align:center"><img src="index14.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index15.png" alt=""></td> +<td style="text-align:center"><img src="index16.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index17.png" alt=""></td> +<td style="text-align:center"><img src="index18.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman&rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. 
On the other hand, LFR only succeeds significantly in the case of the Diameter.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index19.png" alt=""></td> +<td style="text-align:center"><img src="index20.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index21.png" alt=""></td> +<td style="text-align:center"><img src="index22.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index23.png" alt=""></td> +<td style="text-align:center"><img src="index24.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could be potentially exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!</p> +<h4 id="references">References</h4> +<p>[1] Arnau Prat-Pérez, <a href="http://dblp.uni-trier.de/pers/hd/d/Dom=iacute=nguez=Sal:David">David Domínguez-Sal</a>: How community-like is the structure of synthetically generated graphs? <a href="http://dblp.uni-trier.de/db/conf/sigmod/grades2014.html#PratD14">GRADES 2014</a></p> +<p>[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014</p> +<p>[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics</p> +<p>[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. 
Benchmark graphs for testing community detection algorithms. Physical Review E 2008.</p> + + + + + Industry Relevance of the Semantic Publishing Benchmark + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + Tue, 03 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + <h3 id="publishing-and-media-businesses-are-going-through-transformation">Publishing and media businesses are going through transformation</h3> +<p>I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendants were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. It was not long ago, in the year 2000, this stand was full. Back than the people in the Bay area were willing to pay for printed newspapers. But this is no longer true.</p> +<p>What’s driving this change in publishing and media?</p> +<ul> +<li> +<p>Widespread and instantaneous distribution of information over the Internet has turned news into somewhat of a &ldquo;commodity&rdquo; and few people are willing to pay for it</p> +</li> +<li> +<p>The wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;</p> +</li> +<li> +<p>Open access publishing has limited academic publishers to sell journals and books at prices that were considered fair ten years ago.</p> +</li> +</ul> +<p><em>Alongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.</em></p> +<h3 id="imagine-instant-news-in-context-imagine-personal-channels-imagine--triplestores">Imagine instant news in context, Imagine personal channels, Imagine &hellip; triplestores</h3> +<p>While plain 
news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. It is also well-known that producing such news in &ldquo;near real time&rdquo; is difficult and expensive using legacy processes and content management technology.</p> +<p>Another example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like <a href="http://new.dowjones.com/products/factiva/">Factiva</a>, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.</p> +<p>Finally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.</p> +<p>Many publishers have been pressed to advance their business. This, in turn, has led to a quest to innovate. And semantic technology can help publishers in two fundamental ways:</p> +<ol> +<li>Generation of rich and &ldquo;meaningful&rdquo; (trying not to use &ldquo;semantic&rdquo; :-) metadata descriptions;</li> +<li>Dynamic retrieval of content, based on this rich metadata, enabling better delivery.</li> +</ol> +<p>In this post I write about &ldquo;semantic annotation&rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. 
The final part of the post is about triplestores – semantic graph database engines, used in DSP. To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.</p> +<h3 id="semantic-annotation-produces-rich-metadata-descriptions--the-fuel-for-semantic-publishing">Semantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing</h3> +<p>The most popular meaning of &ldquo;semantic annotation&rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.</p> +<p><img src="02_semantic_repository.png" alt=""></p> +<p>The concept of using <a href="http://infosys3.elfak.ni.ac.rs/nastava/attach/SemantickiWebKurs/sdarticle.pdf">text-mining for automatic semantic annotation</a> of text with respect to very large datasets, such as <a href="http://dbpedia.org/">DBPedia</a>, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether &ldquo;Paris&rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. 
Sometimes this is massively difficult – try to instruct a computer how to guess whether &ldquo;Hilton&rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or that I had the chance to meet Paris Hilton in person on the street in San Francisco.</p> +<p>Today there are plenty of tools (such as the <a href="https://www.ontotext.com/semantic-solutions/media-publishing/">Ontotext Media and Publishing</a> platform and <a href="https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki">DBPedia Spotlight</a>) and services (such as Thomson Reuters’ <a href="http://www.opencalais.com/">OpenCalais</a> and Ontotext’s <a href="http://s4.ontotext.com">S4</a>) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios where technology like this would revolutionize a business. This is the case with the Dynamic Semantic Publishing scenario described below.</p> +<h3 id="the-bbcs-dynamic-semantic-publishing-dsp">The BBC’s Dynamic Semantic Publishing (DSP)</h3> +<p>Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.</p> +<p>BBC Future Media &amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. 
At the end of the day DSP improves the user experience on BBC’s web site.</p> +<p><em>&ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories&rdquo;.</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">Jem Rayfield, Senior Technical Architect</a>, BBC News and Knowledge</p> +<p>The Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (&ldquo;edited by exception&rdquo;) and support semantic advertisement placement for audiences outside of the UK. The following quote explains the workflow when a new article gets into BBC’s content management system.</p> +<p><em>&ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#language">natural language and ontological determiner process</a> automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.</em></p> +<p><img src="03_bbc_sport.png" alt=""></p> +<p><em>Journalist-published metadata is captured and made persistent for querying using the resource description framework (<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#RDF"><em>RDF</em></a>) metadata representation and triple store technology. 
<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#BigOWLIM">A RDF triplestore</a> and <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#SPARQL">SPARQL</a> approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept &ldquo;Frank Lampard&rdquo;, then the framework infers and applies concepts such as &ldquo;England Squad&rdquo;, &ldquo;Group C&rdquo; and &ldquo;FIFA World Cup 2010&rdquo; &hellip;&rdquo;</em> &ndash; Jem Rayfield</p> +<p>One can consider each of the &ldquo;aggregation pages&rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.</p> +<p><em>&ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content</em></p> +<p><strong>…</strong><strong><em>we are not publishing pages, but publishing content</em></strong> <em>as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.</em></p> +<p><img src="04_content_tagging.png" alt=""></p> +<p><em>… The index pages are published automatically. 
This process is what assures us of the highest quality output, but still <strong>save large amounts of time</strong> in managing the site and <strong>makes it possible for us to efficiently run so many pages</strong> for the World Cup.&rdquo;</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/the_world_cup_and_a_call_to_ac.html">John O&rsquo;Donovan, Chief Technical Architect, BBC Future Media &amp; Technology</a></p> +<p>To get a real feeling about the load of the triplestore behind BBC&rsquo;s World Cup web site, here are some statistics:</p> +<ul> +<li> +<p>800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;</p> +</li> +<li> +<p>Average unique page requests/day: 2 million;</p> +</li> +<li> +<p>Average <strong>SPARQL queries/day: 1 million;</strong></p> +</li> +<li> +<p><strong>100s repository updates/inserts per minute</strong> with OWL 2 RL reasoning;</p> +</li> +<li> +<p>Multi data center that is fully resilient, clustered 6 node triplestore.</p> +</li> +</ul> +<h3 id="the-semantic-publishing-benchmark">The Semantic Publishing Benchmark</h3> +<p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).</p> +<p>SPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. This content is being enriched with metadata that describes it through links to reference knowledge:</p> +<ul> +<li> +<p><em>Reference knowledge:</em> taxonomies and databases that include relevant concepts, entities and factual information (e.g. 
sport statistics);</p> +</li> +<li> +<p><em>Metadata</em> for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.</p> +</li> +</ul> +<p>In this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:</p> +<ul> +<li> +<p><em>Aggregation queries</em> retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;</p> +</li> +<li> +<p><em>Updates</em>, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.</p> +</li> +</ul> +<p>SPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from <a href="http://www.geonames.org/">Geonames</a> for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.</p> +<p>A more technical introduction to SPB can be found in this <a href="https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark">post</a>. 
Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">post</a>. An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.</p> +<p>Despite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of &ldquo;fast flowing&rdquo; content need to be &ldquo;dispatched&rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:</p> +<ul> +<li> +<p>The Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;</p> +</li> +<li> +<p>Reasoning is needed to map content descriptions to queries in a flexible manner;</p> +</li> +<li> +<p>There are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.</p> +</li> +</ul> +<h3 id="spb-v20--steeper-for-the-engines-closer-to-the-publishers">SPB v.2.0 – steeper for the engines, closer to the publishers</h3> +<p>We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.</p> +<p>The major changes in SPB v.2.0 can be summarized as follows:</p> +<ul> +<li> +<p>Much bigger reference dataset: from 170 thousand to 22 million statements. 
Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;</p> +</li> +<li> +<p>Better interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;</p> +</li> +<li> +<p>Retrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.</p> +</li> +</ul> + + + + + OWL-Empowered SPARQL Query Optimization + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + Wed, 18 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + <p>The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. 
Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.</p> +<p>In this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompany Linked Data.</p> +<p>OWL adopts the Open World Assumption and hence OWL axioms are perceived primarily to infer new knowledge. Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.</p> +<p>This richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.</p> +<p>The aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. 
To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found <a href="LDBC_D4.4.2_final.pdf">here</a>.</p> +<h3 id="schema-based-optimization-techniques">Schema-Based Optimization Techniques</h3> +<p>Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.</p> +<p>A simple first case is the case of constraint violation. Consider the query below, which returns all instances of class <code>&lt;A&gt;</code> which are fillers of a specific property <code>&lt;P&gt;</code>. If the underlying schema contains the information that the range of <code>&lt;P&gt;</code> is class <code>&lt;B&gt;</code>, and that class <code>&lt;B&gt;</code> is disjoint from class <code>&lt;A&gt;</code>, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.</p> +<pre tabindex="0"><code>SELECT ?v +WHERE { ?v rdf : type &lt;A&gt; . + ?u &lt;P&gt; ?v . ?u &lt;P&gt; ?v1 . + ?u &lt;P1 &gt; ?v2 . ?u &lt;P2 &gt; ?v3 . + ?u &lt;P3 &gt; ?v4 . ?u &lt;P4 &gt; ?v5} +</code></pre><p>Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class <code>&lt;A1&gt;</code> can immediately be identified as not being in the answer set.</p> +<pre tabindex="0"><code>SELECT ?c +WHERE { ?x rdf: type ?c . ?x &lt;P&gt; ?y . 
FILTER NOT EXISTS { ?x rdf: type &lt;A1 &gt; }}
For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property <code>P4</code> is a symmetric property.</p> +<pre tabindex="0"><code>SELECT ?y ?y1 ?y2 ?y3 +WHERE { ?x &lt;P1 &gt; ?y . ?x &lt;P2 &gt; ?y1 . + ?x &lt;P3 &gt; ?y2 . ?y3 &lt;P4 &gt; ?x } +</code></pre><h3 id="conclusion">Conclusion</h3> +<p>In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:</p> +<ul> +<li>Cases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results.</li> +<li>Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing.</li> +<li>Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query.</li> +<li>Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques.</li> +</ul> +<p>This list is by no means complete, as further cases can be identified by optimizers. 
Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.</p> + + + + + Person Activity Subgraph Features in LDBC DATAGEN + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + Wed, 04 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + <p>When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically borrowed by the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people being connected by friendship relations, but has a lot more of other things is worth to look at. With a quick view to commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed by posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.</p> +<p>When looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries needs traversing parts of the person activity subgraph, and about 80% of all the generated data by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to make sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. 
In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.</p> +<h3 id="reaslistic-message-content">Reaslistic Message Content</h3> +<p>Messages&rsquo; content in DATAGEN is not random, but contains snippets of text extracted from Dbpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of commets (replies to posts), there is a probability of 0.66 to be very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is tipical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying posts to change during the flow of the conversation, moving from post&rsquo;s tags to other related or randomly selected tags.</p> +<h3 id="non-uniform-activity-levels">Non uniform activity levels</h3> +<p>In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomena by correlating the activity level with the amount of friends the person has. That is, the larger the amount of friends a person has, the larger the number of posts it creates, and also, the larger the number of groups it belongs to.</p> +<h3 id="time-correlated-post-and-comment-generation">Time correlated post and comment generation</h3> +<p>In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. 
For this reason, we observe spikes of activity around these events, where the amount of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also not all the events are equally relevant, thus having spikes larger than others. The shape of the activity is modeled following the model described in <a href="#references">[1]</a>. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.</p> +<p><img src="1.png" alt="image"></p> +<p>As we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing volume activity is due to the fact that more people is added to the social network as time advances.</p> +<p>In this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.</p> +<h4 id="references">References</h4> +<p>[1] Leskovec, J., Backstrom, L., &amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In <em>Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 497-506). 
ACM.</p> + + + + + SNB Driver - Part 2: Tracking Dependencies Between Queries + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + Fri, 23 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + <p>The <a href="https://ldbcouncil.org/post/snb-driver-part-1">SNB Driver part 1</a> post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we&rsquo;ll drill down deeper into the details of what it means to execute &ldquo;dependent queries&rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.</p> +<h3 id="definitions">Definitions</h3> +<ul> +<li> +<p><em>Simulation Time (ST)</em>: notion of time created by data generator. All time stamps in the generated data set are in simulation time</p> +</li> +<li> +<p><em>Real Time (RT)</em>: wall clock time</p> +</li> +<li> +<p><em>Time Compression Ratio</em>: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark</p> +</li> +<li> +<p><em>Operation</em>: read and/or write</p> +</li> +<li> +<p><em>Dependencies</em>: operations in this set introduce dependencies in the workload. 
That is, for every operation in this set there exists at least one other operation (in Dependents) that can not be executed until this operation has been processed</p> +</li> +<li> +<p><em>Dependents</em>: operations in this set are dependent on at least one other operation (in Dependencies) in the workload</p> +</li> +<li> +<p><em>Due Time (DueT)</em>: point in simulation time at which the execution of an operation should be initiated.</p> +</li> +<li> +<p><em>Dependent Time (DepT)</em>: in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.</p> +</li> +<li> +<p><em>Safe Time (SafeT)</em>: time duration.</p> +<ul> +<li> +<p>when two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them</p> +</li> +<li> +<p>SafeT is the minimum duration between the Dependency Time and Due Time of any operations in Dependents</p> +</li> +</ul> +</li> +<li> +<p>​<em>Operation Stream</em>: sequence of operations ordered by Due Time (dependent operations must separated by at least SafeT)</p> +</li> +<li> +<p><em>Initiated Operations</em>: operations that have started executing but not yet finished</p> +</li> +<li> +<p><em>Local Completion Time (per driver)</em>: point in simulation time behind which there are no uncompleted operationsLocal Completion Time = min(min(Initiated Operations), max(Completed Operations))</p> +</li> +<li> +<p><em>Global Completion Time (GCT)</em>: minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation that operation is safe to execute, i.e., the operations it depends on have all completed executing. 
Global Completion Time = min(Local Completion Time)​</p> +</li> +<li> +<p><em>Execution Window (Window)</em>: a timespan within which all operations can be safely executed</p> +<ul> +<li> +<p>All operations satisfying window.startTime &lt;= operation.DueT &lt; window.endTime may be executed</p> +</li> +<li> +<p>Within a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window</p> +</li> +<li> +<p>To ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 &lt; window.duration &lt;= SafeT</p> +</li> +<li> +<p>Window duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable</p> +</li> +<li> +<p>Before any operations within a window can start executing it is required that: GCT &gt;= window.startTime - (SafeT - window.duration)</p> +</li> +<li> +<p>All operations within a window must initiate and complete between window start and end times: window.startTime &lt;= operation.initiate &lt; window.endTime and window.startTime &lt;= operation.complete &lt; window.endTime</p> +</li> +</ul> +</li> +<li> +<p><em>Dependency Mode</em>: defines dependencies, constraints on operation execution order</p> +</li> +<li> +<p><em>Execution Mode</em>: defines how the runtime should execute operations of a given type</p> +</li> +</ul> +<h3 id="tracking-dependencies">Tracking Dependencies</h3> +<p>Now, the fun part, making sure dependent operations are executed in the correct order.</p> +<p>Consider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. 
That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).</p> +<p>Logically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:</p> +<ul> +<li>the set of operations that have started executing but not yet finished.</li> +</ul> +<p>Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:</p> +<ul> +<li>the set of operations that have started and finished executing.</li> +</ul> +<p>Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):</p> +<ul> +<li>the point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time.</li> +</ul> +<p>LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:</p> +<ul> +<li> +<p><em>Due Time</em>: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled</p> +</li> +<li> +<p><em>GCT</em>: every operation (from Dependencies) with a Due Time before this point in time has completed execution</p> +</li> +</ul> +<p>However, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. 
What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:</p> +<ul> +<li>in addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute.</li> +</ul> +<p>Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.</p> +<h3 id="scalable-execution-in-the-presence-of-dependencies">Scalable execution in the Presence of Dependencies</h3> +<p>The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:</p> +<p>a) make the generated load less &lsquo;bursty&rsquo;</p> +<p>b) allow the driver to &lsquo;scale&rsquo;, so when the driver is given more resources (CPUs, servers, etc.) 
it is able to generate more load.</p> +<p>In the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. Logically, all operations within a Window are executed at the same time, some time within the Window. No guaranty is made regarding exactly when, or in what order, an operation will execute within its Window.</p> +<p>The reasons this approach is correct are as follows:</p> +<ul> +<li> +<p>Operations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked</p> +</li> +<li> +<p>The minimum duration between the Dependency Time and Due Time of any operation in Dependents is known (can be calculated by scanning through workload once), this duration is referred to as Safe Time (SafeT)</p> +</li> +<li> +<p>A window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until</p> +<p>GCT &gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT</p> +</li> +</ul> +<p>The advantages of such an execution mode are as follows:</p> +<ul> +<li> +<p>As no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window</p> +</li> +<li> +<p>Then, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. 
There is no need or benefit to communicating GCT protocol messages more frequently than approximately Window.duration, the side effect of which is reduced network traffic
No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.</p> + + + + + SNB Driver - Part 3: Workload Execution Putting It All Together + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + Tue, 20 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + <p>Up until now we have introduced the <a href="https://ldbcouncil.org/post/snb-driver-part-1">challenges faced when executing the LDBC SNB benchmark</a>, as well as explained <a href="https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries">how some of these are overcome</a>. With the foundations laid, we can now explain precisely how operations are executed.</p> +<p>Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.</p> +<h3 id="dependency-modes">Dependency Modes</h3> +<p>While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.</p> +<p>Another way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. 
The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. During workload execution, operations of each type are treated as follows:</p> +<p><strong>• None</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>– Prior Execution: do nothing – After Execution: do nothing</p> +<p><strong>• Read Only</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: wait for GCT &gt;= operation.DepTime – After Execution: do nothing</p> +<p><strong>• Write Only</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<p><strong>• Read Write</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced 
sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations, wait for GCT &lt; operation.DepT</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<h3 id="execution-modes">Execution Modes</h3> +<p>Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more steams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.</p> +<p><strong>• Asynchronous</strong>: operations are executed individually, when their Due Time arrives.</p> +<p>Motivation: This is the default execution mode, it executes operations as true to the workload definition as possible.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: unbounded</p> +<p>– Max Execution Time: unbounded</p> +<p>– Failure: operation execution starts later than: operation.DueT Tolerated Delay</p> +<p><strong>• Synchronous</strong>: operations are executed individually, sequentially, in blocking manner.</p> +<p>Motivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. 
However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency. Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.</p> +<p>The alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. 
Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler</p> +<p>– Execute When time &gt;= operation.DueT and previousOperation.completed == true (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: 1</p> +<p>– Max Execution Time: nextOperation.DueT - operation.DueT</p> +<p>– Failure: operation execution starts later than: operation.DueT Tolerated Delay E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay</p> +<p><strong>• Partially Synchronous</strong> (Windowed Execution, described in Section 3.4 in more details), groups of operations from the same time window are executed together</p> +<p>– Re-scheduling Before Execution: Yes, as long as the following still holds:</p> +<p>window.startTime &lt;= operation.DueT &lt; window.startTime + window.duration</p> +<p>Operations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due Times, and therefore ordering, may be modified</p> +<p>– Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: number of operations within window</p> +<p>– Max Execution Time: (window.startTime + window.duration) - operation.DueT</p> +<p>– Failure: operation execution starts later than: window.startTime window.duration operation execution does not finish by: window.startTime + window.duration</p> +<h3 id="tying-it-back-to-ldbc-snb">Tying it back to LDBC SNB</h3> +<p>The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stem from dependencies introduced by the graph structure. 
In other words, workload partitioning becomes as hard as graph partitioning.</p> +<p>The LDBC SNB data can in fact be seen as a union of two parts:</p> +<ol> +<li> +<p>Core Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.</p> +</li> +<li> +<p>User Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the &ldquo;core&rdquo; part are satisfied (i.e., users don&rsquo;t post things before the profiles are created, etc.).</p> +</li> +</ol> +<p>In order to avoid friendship graph partitioning, the driver introduces the concept SafeT, the minimal simulation time that should pass between two dependent events.</p> +<p>This property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety. Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.</p> +<p>On the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there is typically a lot of forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. 
we efficiently create the so-called flash-mob effects in the posting/replying workload).</p> + + + + + Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + Tue, 13 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + <p>Until now we have discussed several aspects of the <a href="https://ldbcouncil.org/benchmarks/spb">Semantic Publishing Benchmark (SPB)</a> such as the <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">difference in performance between virtual and real servers configuration</a>, how to choose an <a href="https://ldbcouncil.org/post/making-semantic-publishing-execution-rules">appropriate query mix</a> for a benchmark run and our experience with using SPB in the development process of GraphDB for <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">finding performance issues</a>.</p> +<p>In this post we provide a step-by-step guide on how to run SPB using the <a href="http://rdf4j.org/">Sesame</a> RDF data store on a fresh install of <a href="http://releases.ubuntu.com/14.04.1/">Ubuntu Server 14.04.1</a>. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.</p> +<h3 id="prerequisites">Prerequisites</h3> +<p>We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:</p> +<ul> +<li>Git</li> +<li>Apache Ant 1.8 or higher</li> +<li>OpenJDK 6 or Oracle JDK 6 or higher</li> +<li>Apache Tomcat 7 or higher</li> +</ul> +<p>If you already have these components installed on your machine you can directly proceed to the next section: <em>Installing Sesame</em></p> +<p>Following are sample commands which can be used to install the required software components:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo apt-get install git +</span></span><span style="display:flex;"><span>sudo apt-get install ant +</span></span><span style="display:flex;"><span>sudo apt-get install default-jdk +</span></span><span style="display:flex;"><span>sudo apt-get install tomcat7 +</span></span></code></pre></div><p>Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.</p> +<p>After a successful installation of Apache Tomcat you should be able to get the default splash page <em>“It works”</em> when you open your web browser and enter the following address: http://&lt;your_ip_address&gt;:8080</p> +<h3 id="installing-sesame">Installing Sesame</h3> +<p>We will use current Sesame version 2.7.14. 
You can download it <a href="http://sourceforge.net/projects/sesame/files/Sesame%202/">here</a> or run following command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>wget <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download&#34;</span> <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> -O openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>Then extract the Sesame tarball:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>tar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>To deploy sesame you have to copy the two war files that are in <em>openrdf-sesame-2.7.14/war</em> to <em>/var/lib/tomcat7/webapps</em></p> +<p>From <em>openrdf-sesame-2.7.14/war</em> you can do it with command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>cp openrdf-*.war &lt;tomcat_install&gt;/webapps +</span></span></code></pre></div><p>Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.</p> +<p>By default the configuration directory is: <em>/usr/share/tomcat7/.aduna</em></p> +<p>Create the directory:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" 
data-lang="bash"><span style="display:flex;"><span>sudo mkdir /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Then change the ownership:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chown tomcat7 /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>And finally you should give the necessary permissions:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chmod o+rwx /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Now when you go to: http://&lt;your_ip_address&gt;:8080/openrdf-workbench/repositories</p> +<p>You should get a screen like this:</p> +<p><img src="01-Sesame-repo-list.png" alt="image"></p> +<h3 id="setup-spb">Setup SPB</h3> +<p>You can download the SPB code and find brief documentation on GitHub:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm">https://github.com/ldbc/ldbc_spb_bm</a></p> +<p>A detailed documentation is located here:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf">https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf</a></p> +<p>SPB offers many configuration options which control various features of the benchmark e.g.:</p> +<ul> +<li>query mixes</li> +<li>dataset size</li> +<li>loading datasets</li> +<li>number of agents</li> +<li>validating results</li> +<li>test conformance to OWL2-RL ruleset</li> +<li>update rate of agents</li> +</ul> +<p>Here we demonstrate how to generate a dataset and execute a simple test<br> +run with it.</p> +<p>First download the SPB source code from the repository:</p> +<div class="highlight"><pre tabindex="0" 
style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>git clone https://github.com/ldbc/ldbc_spb_bm.git +</span></span></code></pre></div><p>Then in the ldbc_spb_bm directory build the project:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>ant build-basic-querymix +</span></span></code></pre></div><p>If you simply execute the command:</p> +<pre tabindex="0"><code>ant +</code></pre><p>you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this step-by-step guide, configuration shown above is sufficient.</p> +<p>Depending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat&rsquo;s startup files e.g. in <em>catalina.sh</em>:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>export JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;-d64 -Xmx4G&#34;</span> +</span></span></code></pre></div><p>To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:</p> +<p><img src="02-Sesame-create-repo.png" alt="image"></p> +<p>Then we need to point the benchmark test driver to the SPARQL endpoint of that repository. 
This is done in <em>ldbc_spb_bm/dist/test.properties</em> file.</p> +<p>The default value of <em>datasetSize</em> in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.</p> +<p>You need to change</p> +<pre tabindex="0"><code>datasetSize=1000000 +</code></pre><p>Also the URLs of the SPARQL endpoint for the repository</p> +<pre tabindex="0"><code>endpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 +endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements +</code></pre><p>First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.</p> +<p>These are the settings to do that, following parameters will &lsquo;instruct&rsquo; the SPB test driver to perform all the actions described above:</p> +<pre tabindex="0"><code>#Benchmark Operational Phases +loadOntologies=true +loadReferenceDatasets=true +generateCreativeWorks=true +loadCreativeWorks=true +generateQuerySubstitutionParameters=true +validateQueryResults=false +warmUp=false +runBenchmark=false +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>To run the benchmark execute the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>java -jar semantic_publishing_benchmark-basic-standard.jar +</span></span><span style="display:flex;"><span>test.properties +</span></span></code></pre></div><p>When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.</p> +<p>Next we will measure the performance of Sesame Data Store by changing some configuration properties:</p> +<pre tabindex="0"><code>#Benchmark 
Configuration Parameters +warmupPeriodSeconds=60 +benchmarkRunPeriodSeconds=300 +... +#Benchmark Operational Phases +loadOntologies=false +loadReferenceDatasets=false +generateCreativeWorks=false +loadCreativeWorks=false +generateQuerySubstitutionParameters=false +validateQueryResults=false +warmUp=true +runBenchmark=true +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>After the benchmark test run has finished result files are saved in folder: <em>dist/logs</em></p> +<p>There you will find three types of results: the result summary of the benchmark run (<em>semantic_publishing_benchmark_results.log),</em> brief results and detailed results.</p> +<p>In <em>semantic_publishing_benchmark_results.log</em> you will find the results distributed per seconds. They should be similar to the listing bellow:</p> +<p>Benchmark Results for the 300-th second</p> +<pre tabindex="0"><code>Seconds : 300 (completed query mixes : 0) + Editorial: + 2 agents + + 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) + 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) + 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) + + 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) + 0.0300 average operations per second + + Aggregation: + 8 agents + + 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) + 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) + 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) + 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) + 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) + 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) + 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) + 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) + 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) +</code></pre><p>This step-by-step 
guide gave an introduction on how to setup and run the SPB on a Sesame Data Store. Further details can be found in the reference documentation listed above.</p> +<p>If you have any troubles running the benchmark, don&rsquo;t hesitate to comment or use our social media channels.</p> +<p>In a future post we will go through some of the parameters of SPB and check their performance implications.</p> + + + + + Semantic Publishing Instance Matching Benchmark + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + Tue, 30 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + <p>The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.</p> +<p>The SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:</p> +<ul> +<li>value-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity</li> +<li>structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation)</li> +</ul> +<p>The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:</p> +<ul> +<li>instance (in)equality (owl:sameAs, owl:differentFrom)</li> +<li>class and property equivalence (owl:equivalentClass, owl:equivalentProperty)</li> +<li>class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties)</li> +<li>class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf)</li> +<li>property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty)</li> +<li>complex class definitions (owl:unionOf, owl:intersectionOf)</li> +</ul> +<p>SPIMBench uses and extends the ontologies of LDBC&rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB&rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.</p> +<p>Value and structure-based test cases are implemented using the SWING framework <a href="#references">[1]</a> on data and object type properties respectively. 
These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.</p> +<p>SPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.</p> +<p>SPIMBench can be downloaded from <a href="https://github.com/jsaveta/SPIMBench">our repository</a> and a more thorough description thereof can be found on <a href="http://www.ics.forth.gr/isl/spimbench/">http://www.ics.forth.gr/isl/spimbench/</a>.</p> +<h4 id="references">References</h4> +<p>[1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + Further Developments in SNB BI Workload + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + Thu, 18 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + <p>We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and myself are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.</p> +<p>As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.</p> +<p>There are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.</p> +<p>Let’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.</p> +<p>One of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequence of events and the quantity and quality of interactions between players lead to intractably long queries which are hard to understand and debug. 
Therefore, views and intermediate materializations become increasingly necessary.</p> +<p>Another feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.</p> +<p>Since a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.</p> +<p>An orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.</p> +<p>So, let’s see how some of these aspects could be captured in the SNB context.</p> +<p>Geography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.</p> +<p>The communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or fewer participants. Membership in a high traffic forum with few participants would indicate a strong connection. 
Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -&gt; connection strength. In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.</p> +<p>The behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster at a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.</p> +<p>In John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes nearly always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”</p> +<p>Analytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.</p> +<p>From a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. 
Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.</p> +<p>This leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.</p> +<p>There is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.</p> + + + + + Sizing AWS Instances for the Semantic Publishing Benchmark + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + Wed, 17 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + <p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">BBC Dynamic Semantic Publishing</a> scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). 
As we <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">wrote earlier</a>, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.</p> +<p>Lately we tested different Amazon Web Services (<a href="https://aws.amazon.com/">AWS</a>) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that <a href="https://www.ontotext.com/products/ontotext-graphdb/">GraphDB</a> experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.</p> +<h3 id="the-experiment">The Experiment</h3> +<p>For our tests we use:</p> +<ul> +<li>GraphDB Standard v6.1</li> +<li>LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: +<ul> +<li>8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also</li> +<li>50M dataset (SF1)</li> +<li>40 minutes of benchmark run time (60 seconds of warm up)</li> +</ul> +</li> +<li>5 different Amazon EC2 instances and one local server</li> +</ul> +<p>Each test run is cold, i.e. 
data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.</p> +<p>We use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.</p> +<p>We also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.</p> +<h3 id="the-results">The Results</h3> +<p>For the tests we measured:</p> +<ul> +<li><em>queries/s</em> for the read threads, where queries include SELECT and CONSTRUCT</li> +<li><em>updates/s</em> for the write threads, where an update operation is INSERT or DELETE</li> +<li><em>queries/$</em> and <em>updates/$</em> – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput</li> +<li><em>update/vCPU</em> – modification operations per vCPU per second</li> +</ul> +<p>Results (Table 1.) 
provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is c3.2xlarge machine.</p> +<p>The improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold where the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.</p> +<p>Table 1. 
SPB Measurement Results on AWS and Local Servers</p> +<table> +<thead> +<tr> +<th>Server Type</th> +<th>vCPUs</th> +<th>R/W Agents</th> +<th>RAM (GB)</th> +<th>&ldquo;Storage (GB, SSD)&rdquo;</th> +<th>Price USD/h</th> +<th>Queries/ sec.</th> +<th>Updates/ sec.</th> +<th>Queries/ USD</th> +<th>Updates/ USD</th> +<th>Updates/ vCPU</th> +</tr> +</thead> +<tbody> +<tr> +<td>m3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>15</td> +<td>2x 40</td> +<td>0.28</td> +<td>8.39</td> +<td>8.23</td> +<td>107 882</td> +<td>105 873</td> +<td>2.06</td> +</tr> +<tr> +<td>m3.2xlarge</td> +<td>8</td> +<td>8/2</td> +<td>30</td> +<td>2x 80</td> +<td>0.56</td> +<td>15.44</td> +<td>15.67</td> +<td>99 282</td> +<td>100 752</td> +<td>1.96</td> +</tr> +<tr> +<td>c3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>7.5</td> +<td>2x 40</td> +<td>0.21</td> +<td>7.17</td> +<td>6.78</td> +<td>122 890</td> +<td>116 292</td> +<td>1.7</td> +</tr> +<tr> +<td><strong>c3.2xlarge</strong></td> +<td><strong>8</strong></td> +<td><strong>8/2</strong></td> +<td><strong>15</strong></td> +<td><strong>2x 80</strong></td> +<td><strong>0.42</strong></td> +<td><strong>16.46</strong></td> +<td><strong>14.56</strong></td> +<td><strong>141 107</strong></td> +<td><strong>124 839</strong></td> +<td><strong>1.82</strong></td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>8/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>23.23</strong></td> +<td><strong>21.17</strong></td> +<td><strong>99 578</strong></td> +<td><strong>90 736</strong></td> +<td><strong>1.32</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>8/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>22.89</td> +<td>20.39</td> +<td>98 100</td> +<td>87 386</td> +<td>1.27</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/2</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.6</td> +<td>19.11</td> +<td>114 
000</td> +<td>81 900</td> +<td>1.19</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.19</td> +<td>19.18</td> +<td>112 243</td> +<td>82 200</td> +<td>1.2</td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>14/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>30.84</strong></td> +<td><strong>16.88</strong></td> +<td><strong>132 171</strong></td> +<td><strong>72 343</strong></td> +<td><strong>1.06</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>14/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>29.67</td> +<td>17.8</td> +<td>127 157</td> +<td>76 286</td> +<td>1.11</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.11</td> +<td>32.04</td> +<td>156 712</td> +<td>135 302</td> +<td>1</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.31</td> +<td>32.07</td> +<td>157 557</td> +<td>135 429</td> +<td>1</td> +</tr> +<tr> +<td><strong>Local</strong></td> +<td><strong>32</strong></td> +<td><strong>10/2</strong></td> +<td><strong>256</strong></td> +<td><strong>8x 256</strong></td> +<td><strong>0.85</strong></td> +<td><strong>40</strong></td> +<td><strong>31.01</strong></td> +<td><strong>168 916</strong></td> +<td><strong>130 952</strong></td> +<td><strong>0.97</strong></td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.39</td> +<td>26.42</td> +<td>153 672</td> +<td>111 569</td> +<td>0.83</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.22</td> +<td>26.39</td> +<td>152 954</td> +<td>111 443</td> +<td>0.82</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>20/2</td> +<td>256</td> +<td>8x 256</td> 
+<td>0.85</td> +<td>34.59</td> +<td>23.86</td> +<td>146 070</td> +<td>100 759</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<h3 id="the-optimal-number-of-test-agents">The Optimal Number of Test Agents</h3> +<p>Experimenting with different numbers of aggregation (read) and editorial (write) agents at c3.4xlarge and the local server, we made some interesting observations:</p> +<ul> +<li>There is almost no benefit in using more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads;</li> +<li>Using more read agents can have a negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the latter case GraphDB handles a somewhat higher number of queries (31 vs. 23) we see a drop in the update rate (from 21 to 17);</li> +<li>Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations;</li> +<li>For machines with 16 or more cores, a configuration like 10/2 or 14/2 would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server;</li> +<li>Launching more than 14 read agents does not help even on the local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization.</li> +<li>There is some overhead when handling a larger number of agents as the results for the local server tests with 14/3 and 20/2 show the worst results for both queries and updates.</li> +</ul> +<h3 id="efficiency-and-cost">Efficiency and Cost</h3> +<p>AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.</p> +<p>Cloud infrastructure providers like Amazon allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.</p> +<p>$1 spent on c3.2xlarge ($0.42/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!</p> +<p>The full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for a machine like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine has 256GB of RAM, which is overkill for the Semantic Publishing Benchmark run at 50M scale. Under all these assumptions we see that using a local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference in price per query in this case is low indicates that using AWS services comes at very low extra cost.</p> +<p>To put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:</p> +<ul> +<li><strong>100 queries/sec.</strong> handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events.</li> +<li><strong>10 updates/sec.</strong> - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in isolation. 
There are relatively few content management applications that need more than 36 000 updates per hour.</li> +<li><strong>$81/day</strong> is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times.</li> +</ul> + + + + + DATAGEN: a Realistic Social Network Data Generator + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + Sat, 06 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/getting-started-with-snb">Getting started with snb</a>, <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">DATAGEN: data generation for the Social Network Benchmark</a>), Arnau Prat discussed the main features and characteristics of DATAGEN: <em>realism</em>, <em>scalability</em>, <em>determinism</em>, <em>usability</em>. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the <a href="https://github.com/ldbc/ldbc_snb_datagen">instructions for generating a SNB dataset</a> and <a href="https://github.com/ldbc/ldbc_snb_implementations/tree/master/interactive/virtuoso">for loading the dataset into Virtuoso</a>. 
In the following sections, we analyze several aspects of the generated dataset.</p> +<h3 id="a-realistic-social-graph">A Realistic social graph</h3> +<p>One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their <em><knows></em> relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (See the note about <a href="https://www.facebook.com/notes/facebook-data-team/anatomy-of-facebook/10150388519243859">Facebook Anatomy</a>). This is not by chance, as DATAGEN has been designed to deliberately reproduce the Facebook&rsquo;s graph distribution.</p> +<p><img src="Cumulative-distribution.png" alt="image"> <br> +Figure 1: Cumulative distribution #friends per user</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-r" data-lang="r"><span style="display:flex;"><span><span style="color:#75715e">#R script for generating the social degree distribution </span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">#Input files: person_knows_person_*.csv</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(data.table) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(igraph) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(plotrix) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">require</span>(bit64) +</span></span><span style="display:flex;"><span>dflist <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">lapply</span>(<span 
style="color:#a6e22e">commandArgs</span>(trailingOnly <span style="color:#f92672">=</span> <span style="color:#66d9ef">TRUE</span>), fread, sep<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;|&#34;</span>, +</span></span><span style="display:flex;"><span> header<span style="color:#f92672">=</span>T, select<span style="color:#f92672">=</span><span style="color:#ae81ff">1</span><span style="color:#f92672">:</span><span style="color:#ae81ff">2</span>, colClasses<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;integer64&#34;</span>) +</span></span><span style="display:flex;"><span> df <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">rbindlist</span>(dflist) <span style="color:#a6e22e">setNames</span>(df, <span style="color:#a6e22e">c</span>(<span style="color:#e6db74">&#34;P1&#34;</span>, <span style="color:#e6db74">&#34;P2&#34;</span>)) +</span></span><span style="display:flex;"><span>d2 <span style="color:#f92672">&lt;-</span> df[,<span style="color:#a6e22e">length</span>(P2),by<span style="color:#f92672">=</span>P1] +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">pdf</span>(<span style="color:#e6db74">&#34;socialdegreedist.pdf&#34;</span>) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">plot</span>(<span style="color:#a6e22e">ecdf</span>(d2<span style="color:#f92672">$</span>V1),main<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Cummulative distribution #friends per user&#34;</span>, +</span></span><span style="display:flex;"><span> xlab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Number of friends&#34;</span>, ylab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Percentage number of users&#34;</span>, log<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;x&#34;</span>, +</span></span><span style="display:flex;"><span> xlim<span style="color:#f92672">=</span><span 
style="color:#a6e22e">c</span>(<span style="color:#ae81ff">0.8</span>, <span style="color:#a6e22e">max</span>(d2<span style="color:#f92672">$</span>V1) <span style="color:#f92672">+</span> <span style="color:#ae81ff">20</span>)) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">dev.off</span>() +</span></span></code></pre></div><h3 id="data-correlations">Data Correlations</h3> +<p>Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.</p> +<p><em>Which are the most popular names of a country?</em></p> +<p>We run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, <em>&lsquo;A_country_name&rsquo;</em> is the name of a particular country such as <em>&lsquo;Germany&rsquo;, &lsquo;Netherlands&rsquo;, or &lsquo;Vietnam&rsquo;</em>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> p_lastname, <span style="color:#66d9ef">count</span> (p_lastname) <span style="color:#66d9ef">as</span> namecnt +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">FROM</span> person, country +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> p_placeid <span style="color:#f92672">=</span> ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;A_country_name&#39;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> p_lastname <span style="color:#66d9ef">order</span> <span style="color:#66d9ef">by</span> namecnt <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as <em>Muller</em> is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names&rsquo; distribution may not be exactly the same as the contemporary names&rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.</p> +<p><img src="distribution-germany.png" alt="image"> <br> +Figure 2. 
Distribution of names in Germany</p> +<p><img src="distribution-netherlands.png" alt=""> <br> +Figure 3. Distribution of names in Netherlands</p> +<p><img src="distribution-vietnam.png" alt=""> <br> +Figure 4. Distribution of names in Vietnam</p> +<p><em>Where my friends are living?</em></p> +<p>We run the following query, which computes the locations of the friends of people living in China.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> fctry.ctry_name, <span style="color:#66d9ef">count</span> (<span style="color:#f92672">*</span>) <span style="color:#66d9ef">from</span> person <span style="color:#66d9ef">self</span>, person +</span></span><span style="display:flex;"><span>friend, country pctry, knows, country fctry +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> pctry.ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;China&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> <span style="color:#66d9ef">self</span>.p_placeid <span style="color:#f92672">=</span> pctry.ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> <span style="color:#66d9ef">self</span>.p_personid <span style="color:#66d9ef">and</span> friend.p_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> fctry.ctry_city <span style="color:#f92672">=</span> friend.p_placeid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> fctry.ctry_name <span 
style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.</p> +<p><img src="chinese-friends.png" alt=""> <br> +Figure 5. Locations of friends of people in China</p> +<p><em>Where my friends are studying?</em></p> +<p>Finally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> o2.o_name, <span style="color:#66d9ef">count</span>(o2.o_name) <span style="color:#66d9ef">from</span> knows, person_university +</span></span><span style="display:flex;"><span>p1, person_university p2, organisation o1, organisation o2 +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> +</span></span><span style="display:flex;"><span> p1.pu_organisationid <span style="color:#f92672">=</span> o1.o_organisationid +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> o1.o_name<span style="color:#f92672">=</span><span style="color:#e6db74">&#39;Hangzhou_International_School&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> p1.pu_personid <span style="color:#66d9ef">and</span> p2.pu_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> 
p2.pu_organisationid <span style="color:#f92672">=</span> o2.o_organisationid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> o2.o_name <span style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, top-10 universities for the friends of the Hangzhou School students’ are from China, while people from foreign universities have small number of friends that study in Hangzhou School (See Table 1).</p> +<p><img src="friends-international-school.png" alt=""> <br> +Figure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.</p> +<table> +<thead> +<tr> +<th>Name</th> +<th># of friends</th> +</tr> +</thead> +<tbody> +<tr> +<td>Hangzhou_International_School</td> +<td>12696</td> +</tr> +<tr> +<td>Anhui_University_of_Science_and_Technology</td> +<td>4071</td> +</tr> +<tr> +<td>China_Jiliang_University</td> +<td>3519</td> +</tr> +<tr> +<td>&hellip;</td> +<td></td> +</tr> +<tr> +<td>Darmstadt_University_of_Applied_Sciences</td> +<td>1</td> +</tr> +<tr> +<td>Calcutta_School_of_Tropical_Medicine</td> +<td>1</td> +</tr> +<tr> +<td>Chettinad_Vidyashram</td> +<td>1</td> +</tr> +<tr> +<td>Women&rsquo;s_College_Shillong</td> +<td>1</td> +</tr> +<tr> +<td>Universitas_Nasional</td> +<td>1</td> +</tr> +</tbody> +</table> +<p>Table 1. Universities where friends of Hangzhou International School students are studying at.</p> +<p>In a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. 
Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduces many of those important characteristics found in a real social network, and additionally introduces a series of plausible correlations in it. More and more interesting data correlations may also be found from playing with the SNB generated data.</p> + + + + + SNB Driver - Part 1 + https://ldbcouncil.org/post/snb-driver-part-1/ + Thu, 27 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-1/ + <p>In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as the driver). Software and documentation for this driver are available here: <a href="https://github.com/ldbc/ldbc_driver/">https://github.com/ldbc/ldbc_driver/</a>. Multiple reference implementations by two vendors are available here: <a href="https://github.com/ldbc/ldbc_snb_implementations">https://github.com/ldbc/ldbc_snb_implementations</a>, and discussion of the schema, data properties, and related content is available here: <a href="https://github.com/ldbc/ldbc_snb_docs">https://github.com/ldbc/ldbc_snb_docs</a>.</p> +<p>The following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.</p> +<h3 id="problem-description">Problem Description</h3> +<p>The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person&rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. 
If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting the transaction stream into parallel clients (terminals) is trivial. However, for the LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.</p> +<p>Consider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment on the picture of User A, etc. The second event depends on the first one in the sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in the case of single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. 
Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.</p> +<p>The next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.</p> + + + + + Making Semantic Publishing Execution Rules + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + Tue, 18 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + <p><a href="https://ldbcouncil.org/">LDBC</a> <a href="https://ldbcouncil.org/benchmarks/spb">SPB (Semantic Publishing Benchmark)</a> is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflect the BBC&rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an <a href="https://www.ontotext.com/products/ontotext-graphdb-owlim/">Ontotext Graph DB</a> deployment. Graph DB was formerly known as Owlim.</p> +<p>So, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.</p> +<p>SPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:</p> +<ul> +<li> +<p>Some queries are centred on a particular search or entity. The amount of data touched by the query does not grow at the same rate as the dataset.</p> +</li> +<li> +<p>Some queries cover whole cross sections of the dataset, e.g. 
find the most popular tags across the whole database.</p> +</li> +</ul> +<p>These different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.</p> +<p>Another guiding factor of SPB was the BBC&rsquo;s and others&rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.</p> +<p>Normally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:</p> +<ul> +<li> +<p>Updates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.</p> +</li> +<li> +<p>Short queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.</p> +</li> +<li> +<p>Analytics - These cover a large fraction of the dataset and are roughly linear to data size.</p> +</li> +</ul> +<p>A test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. 
A given system may be especially good at one aspect, leading the test sponsor to accentuate this.</p> +<p>The benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 (1.6 billion triples) SPB database we have the following space consumption figures:</p> +<ul> +<li> +<p>46886 MB of RDF literal text</p> +</li> +<li> +<p>23924 MB of full text index for RDF literals</p> +</li> +<li> +<p>23598 MB of URI strings</p> +</li> +<li> +<p>21981 MB of quads, stored column-wise with default index scheme</p> +</li> +</ul> +<p>Clearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.</p> +<p>Let&rsquo;s now look at a full run at unit scale, i.e. 50M triples.</p> +<p>The run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. 
The run stops by itself when the last of the analytical mixes finishes.</p> +<p>The interactive driver reports:</p> +<pre tabindex="0"><code>Seconds run : 2144 + Editorial: + 2 agents + + 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) + 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) + 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) + + 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) + 39.7122 average operations per second + + Aggregation: + 20 agents + + 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) + 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) + 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) + 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) + 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) + 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) + 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) + 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) + 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) + 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) + 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) + + 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] + + 45318 total retrieval queries (0 timed-out) + 22.5239 average queries per second +</code></pre><p>The analytical driver reports:</p> +<pre tabindex="0"><code>Aggregation: + 2 agents + + 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) + 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) + 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) + 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) + 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) + 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) + 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) + 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) + 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) + 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) + 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) + 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) + 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) + 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) + 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] + + 180 total retrieval queries (2 timed-out) + 0.0862 average queries per second +</code></pre><p>The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)</p> +<p>The SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). 
The DBMS is Virtuoso open source, (<a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack at github.com</a>, <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics</a>).</p> +<p>The minimum update rate of 7/s was sustained but fell short of the target of 70/s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. The analytics mix is for example about 3x faster without other concurrent activity.</p> +<p>Is this good or bad? I would say that this is possible but better can certainly be accomplished.</p> +<p>The initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualify the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.</p> +<p>As an initial comment on the workload mix, I&rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.</p> +<p>Adjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.</p> +<p>In the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.</p> + + + + + Fifth TUC Meeting + https://ldbcouncil.org/event/fifth-tuc-meeting/ + Fri, 14 Nov 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fifth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its fifth Technical User<br> +Community (TUC) meeting.</p> +<p>This will be a one-day event at the National Hellenic Research Institute<br> +in Athens, Greece on <strong>Friday November 14, 2014</strong>.</p> +<h3 id="agenda">Agenda</h3> +<p>10:30 - 11:00 Coffee Break</p> +<p>11:00 - 11:10 Peter Boncz (VUA) Welcome &amp; LDBC project status update (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979841.pptx">Presentation</a>)</p> +<p>11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark:Short Presentation of SPB and Status</p> +<p>Feedback &amp; Roadmap for SPB &amp; OWLIM (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979839.pdf">Presentation</a>)</p> +<p>11:25 - 11:30 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SPB &amp; Virtuoso (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979828.pdf">Presentation</a>)</p> +<p>11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback &amp; Roadmap for SNB &amp; Neo4J (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979830.pdf">Presentation</a>)</p> +<p>11:45 - 12:00 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SNB &amp; Virtuoso (<a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979829.pdf">Presentation</a>)</p> +<p>12:00 - 12:20 Arnau Prat (UPC) &amp; Andrey Gubichev Status, Feedback &amp; Roadmap for SNB Interactive &amp; Sparksee (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979836.pdf">Presentation</a> ) and Business Intelligence (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979837.pdf">Presentation</a>)</p> +<p>12:20 - 12:40 Tomer Sagi, &ldquo;Experience with SNB and TitanDB at HP&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979838.pptx">Presentation</a> )</p> +<p>12:40 - 13:00 Jakob Nelson, &ldquo;graphbench.org on the SNB datagen&rdquo;</p> +<p>13:00 - 14:30 Lunch Break@Byzantine &amp; Christian Museum (<a href="http://www.byzantinemuseum.gr/en/">link</a>)</p> +<p>14:30 - 14:50 Olaf Hartig, &ldquo;Integrating the Property Graph and RDF data models&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979831.pdf">Presentation</a>)\</p> +<p>Documents: <a href="http://arxiv.org/abs/1409.3288">arxiv/1409.3288</a>, <a href="http://arxiv.org/abs/1406.3399">arxiv/1406.3399</a></p> +<p>14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, &ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979842.pdf">Presentation</a>)</p> +<p>15:10 - 15:30 Evaggelia Pitoura, &ldquo;Historical Queries on Graphs&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979835.pdf">Presentation</a>)</p> +<p>15:30 - 16:00 Coffee Break</p> +<p>16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, 
&ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979843.pdf">Presentation</a>)</p> +<p>16:20 - 16:40 Gunes Aluc, &ldquo;WatDiv: How to Tune-up your RDF Data Management System&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979832.pdf">Presentation</a>)</p> +<p>16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, &ldquo;Benchmarking @LogicBlox&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979840.pdf">Presentation</a>)</p> +<p>17:00 - 17:15 Hassan Chafi, &ldquo;Oracle Labs Graph Strategy&rdquo;</p> +<p>17:15 - 17:25 Yinglong Xia, &ldquo;Property Graphs for Industry Solution at IBM&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979834.pdf">Presentation</a>)</p> +<p>17:25 - 17:30 Arthur Keen, &ldquo;Short Introduction to SPARQLcity&rdquo;</p> +<p><em><strong>20:30 Dinner @ Konservokouti <a href="https://plus.google.com/114240752029716758955/about?gl=gr&amp;hl=en">(link)</a></strong></em></p> +<p><em><strong>Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion</strong></em></p> +<h4 id="logistics">Logistics</h4> +<p>The meeting will be held at the <a href="http://www.eie.gr/index-en.html">National Hellenic Research Foundation</a> located in <a href="http://www.eie.gr/location-en.html">downtown Athens</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/5964344.gif" alt=""></p> +<h4 id="travel">Travel</h4> +<p>Athens, Greece&rsquo;s capital city, is easily accessible by air. 
Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.</p> +<p>To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).</p> +<p>You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: <a href="http://www.aia.gr/traveler/">http://www.aia.gr/traveler/</a></p> + + + + + Getting Started With the Semantic Publishing Benchmark + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + Sun, 09 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + <p>The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.</p> +<p>The benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:</p> +<ul> +<li>clustering of creative works around certain entities from the reference datasets (e.g. 
the association of an entity with creative works would decay exponentially in time)</li> +<li>correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time)</li> +</ul> +<p>The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.</p> +<p>Currently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. 
The queries aim at checking different choke points relevant to query optimisation such as:</p> +<ul> +<li>join ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema</li> +<li>subselects that aggregate the query results that the optimiser should recognise and evaluate first</li> +<li>optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last</li> +<li>reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.)</li> +<li>unions to be executed in parallel</li> +<li>optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results</li> +<li>ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results</li> +<li>handling of geo-spatial predicates</li> +<li>full-text search optimisation</li> +<li>asynchronous execution of the aggregate sub-queries</li> +<li>use of distinct to choose the optimal query plan</li> +</ul> +<p>We give below Query 1 of the Semantic Publishing Benchmark.</p> +<pre tabindex="0"><code>PREFIX bbcevent:&lt;http://www.bbc.co.uk/ontologies/event/&gt; +PREFIX geo-pos:&lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; +PREFIX bbc:&lt;http://www.bbc.co.uk/ontologies/bbc/&gt; +PREFIX time:&lt;http://www.w3.org/2006/time#&gt; +PREFIX event:&lt;http://purl.org/NET/c4dm/event.owl#&gt; +PREFIX music-ont:&lt;http://purl.org/ontology/mo/&gt; +PREFIX rdf:&lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt; +PREFIX foaf:&lt;http://xmlns.com/foaf/0.1/&gt; +PREFIX provenance:&lt;http://www.bbc.co.uk/ontologies/provenance/&gt; +PREFIX owl:&lt;http://www.w3.org/2002/07/owl#&gt; +PREFIX cms:&lt;http://www.bbc.co.uk/ontologies/cms/&gt; +PREFIX news:&lt;http://www.bbc.co.uk/ontologies/news/&gt; +PREFIX 
cnews:&lt;http://www.bbc.co.uk/ontologies/news/cnews/&gt; +PREFIX cconcepts:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX dbp-prop:&lt;http://dbpedia.org/property/&gt; +PREFIX geonames:&lt;http://sws.geonames.org/&gt; +PREFIX rdfs:&lt;http://www.w3.org/2000/01/rdf-schema#&gt; +PREFIX domain:&lt;http://www.bbc.co.uk/ontologies/domain/&gt; +PREFIX dbpedia:&lt;http://dbpedia.org/resource/&gt; +PREFIX geo-ont:&lt;http://www.geonames.org/ontology#&gt; +PREFIX bbc-pont:&lt;http://purl.org/ontology/po/&gt; +PREFIX tagging:&lt;http://www.bbc.co.uk/ontologies/tagging/&gt; +PREFIX sport:&lt;http://www.bbc.co.uk/ontologies/sport/&gt; +PREFIX skosCore:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX dbp-ont:&lt;http://dbpedia.org/ontology/&gt; +PREFIX xsd:&lt;http://www.w3.org/2001/XMLSchema#&gt; +PREFIX core:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX curric:&lt;http://www.bbc.co.uk/ontologies/curriculum/&gt; +PREFIX skos:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX cwork:&lt;http://www.bbc.co.uk/ontologies/creativework/&gt; +PREFIX fb:&lt;http://rdf.freebase.com/ns/&gt; + +# Query Name : query1 +# Query Description : +# Retrieve creative works about thing t (or that mention t) +# reasoning: rdfs:subClassOf, rdf:type +# join ordering: cwork:dateModified rdf:type owl:FunctionalProperty +# join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty +# Choke Points : +# - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified +# Optimizer should use an efficient cost evaluation method for choosing the optimal join tree +# - A sub-select which aggregates results. Optimizer should recognize it and execute it first +# - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) +# Optimizer should decide to put optional triples on top of the join tree +# (i.e. 
delay their execution to the last possible moment) because OPTIONALs are treated as a left join +# - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork +# and eliminate first triple (?cwork a ?type .) since ?cwork is a cwork:CreativeWork​ + +CONSTRUCT { + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:shortTitle ?shortTitle ; + cwork:about ?about ; + cwork:mentions ?mentions ; + cwork:dateCreated ?created ; + cwork:dateModified ?modified ; + cwork:description ?description ; + cwork:primaryFormat ?primaryFormat ; + bbc:primaryContentOf ?webDocument . + ?webDocument bbc:webDocumentType ?webDocType . + ?about rdfs:label ?aboutLabel ; + bbc:shortLabel ?aboutShortLabel ; + bbc:preferredLabel ?aboutPreferredLabel . + ?mentions rdfs:label ?mentionsLabel ; + bbc:shortLabel ?mentionsShortLabel ; + bbc:preferredLabel ?mentionsPreferredLabel . + ?creativeWork cwork:thumbnail ?thumbnail . + ?thumbnail a cwork:Thumbnail ; + cwork:altText ?thumbnailAltText ; + cwork:thumbnailType ?thumbnailType . +} +WHERE { + { + SELECT ?creativeWork + WHERE { + ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . + ?creativeWork a cwork:CreativeWork ; + cwork:dateModified ?modified . + } + ORDER BY DESC(?modified) + LIMIT 10 + } + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:dateModified ?modified . + OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } + OPTIONAL { ?creativeWork cwork:description ?description . } + OPTIONAL { ?creativeWork cwork:about ?about . + OPTIONAL { ?about rdfs:label ?aboutLabel . } + OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } + OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } + } + OPTIONAL { + ?creativeWork cwork:mentions ?mentions . + OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } + OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . 
} + OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } + } + OPTIONAL { ?creativeWork cwork:dateCreated ?created . } + OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } + OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . + OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } + OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } + } +} +</code></pre><p>Listing 1. Semantic Publishing Benchmark: Query 1</p> +<p>The benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.</p> +<p>The data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. 
The current version of the benchmark has been tested with Virtuoso and OWLIM.</p> +<p>The generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log) (c) the benchmark results (semantic_publishing_benchmark_results.log).</p> +<p>Below we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and deletes) and queries executed at the Nth second of a benchmark run. It also reports the total number of retrieval queries as well as the average number of queries executed per second.</p> +<pre tabindex="0"><code>Seconds run : 600 + Editorial: + 0 agents + + 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) + 0.0000 average operations per second + + Aggregation: + 8 agents + + 298 Q1 queries + 267 Q2 queries + 243 Q3 queries + 291 Q4 queries + 320 Q5 queries + 286 Q6 queries + 255 Q7 queries + 274 Q8 queries + 271 Q9 queries + + 2505 total retrieval queries + 4.1750 average queries per second +</code></pre><p>Listing 2. 
A snippet of semantic_publishing_benchmark_results.log</p> +<p>We run the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries timeout after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:</p> +<table> +<thead> +<tr> +<th>#triples</th> +<th>Q1</th> +<th>Q2</th> +<th>Q3</th> +<th>Q4</th> +<th>Q5</th> +<th>Q6</th> +<th>Q7</th> +<th>Q8</th> +<th>Q9</th> +<th>#queries</th> +<th>avg. #q. per sec.</th> +</tr> +</thead> +<tbody> +<tr> +<td>10M</td> +<td>298</td> +<td>267</td> +<td>243</td> +<td>291</td> +<td>320</td> +<td>286</td> +<td>255</td> +<td>274</td> +<td>271</td> +<td>2505</td> +<td>4.1750</td> +</tr> +<tr> +<td>100M</td> +<td>53</td> +<td>62</td> +<td>51</td> +<td>52</td> +<td>44</td> +<td>62</td> +<td>25</td> +<td>55</td> +<td>45</td> +<td>449</td> +<td>0.7483</td> +</tr> +<tr> +<td>1B</td> +<td>34</td> +<td>29</td> +<td>22</td> +<td>24</td> +<td>25</td> +<td>29</td> +<td>0</td> +<td>29</td> +<td>28</td> +<td>220</td> +<td>0.3667</td> +</tr> +</tbody> +</table> + + + + + Choke Point Based Benchmark Design + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + <p>The <em>Linked Data Benchmark Council</em> (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and 
arbitrate around the official benchmark results. The council and its <a href="https://ldbcouncil.org">https://ldbcouncil.org</a> website just launched, and in its first 1.5 year of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (<a href="https://ldbcouncil.org/event/fifth-tuc-meeting">next TUC meeting</a> will be on October 5 in Athens) and indeed in <em>designing benchmarks</em>.</p> +<p>So, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by <a href="http://www.tpc.org/">TPC</a> have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are <em>relevant</em> and <em>representative</em> (address important challenges encountered in practice), <em>understandable</em> , <em>economical</em> (implementable on simple hardware), <em>fair</em> (such as not to favor a particular product or approach), <em>scalable</em>, <em>accepted</em> by the community and <em>public</em> (e.g. all of its software is available in open source). This list stems from Jim Gray&rsquo;s <a href="http://research.microsoft.com/en-us/um/people/gray/BenchmarkHandbook/TOC.htm">Benchmark Handbook</a>. In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.</p> +<p>A very important aspect of benchmark development is making sure that the community <em>accepts</em> a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. 
Since in LDBC multiple commercial graph and RDF vendors are on the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on <strong>fairness</strong> had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its <a href="https://ldbcouncil.org/tags/tuc-meeting/">Technical User Community gatherings</a>) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.</p> +<p>The need for <em>understandability</em> for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. 
Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.</p> +<p>The <em>economical</em> aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machinecode SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.</p> +<p><em>Representative</em> benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. 
A rigorous example of this is the <a href="http://aksw.org/Projects/DBPSB.html">DBpedia benchmark</a> whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log only contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).</p> +<p>The fact that a benchmark can be <em>scaled</em> in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. 
Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) &ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.</p> +<p>Now, what makes a benchmark <em>relevant</em>? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of <em>&ldquo;choke points&rdquo;</em>: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute value of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). 
The notion observed in practice is that people who are direct colleagues, often are in each other&rsquo;s friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.</p> +<p>To illustrate what choke points are in more depth, we wrote a <a href="https://ldbcouncil.org/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf">paper in the TPCTC 2013</a> conference that performs a post-mortem analysis of TPC-H and identified 28 such choke points. <em><a href="chokepoints.png">This table</a></em> lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):</p> +<p>I would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper,RDF-3X), and me (MonetDB,Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can <a href="http://www.openlinksw.com/weblog/oerling/?id=1779">trivialize Q18</a>; and this optimization can single handedly improve the overall TPC-score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.</p> +<p>LDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark <a href="https://ldbcouncil.org/developer/spb">(SPB)</a> and the more graph-focused Social Network Benchmark (<a href="https://ldbcouncil.org/developer/snb">SNB</a>), and <a href="https://groups.google.com/forum/#!forum/ldbcouncil">tell us what you think</a>. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.</p> +<p><em>(for more posts from Peter Boncz, see also <a href="https://databasearchitects.blogspot.com">Database Architects</a>, a blog about data management challenges and techniques written by people who design and implement database systems)</em></p> + + + + + New Website Online LDBC Benchmarks Reach Public Draft + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + <p>The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. 
Watch this blog for the announcements to come, as this will be a matter of weeks to add.</p> +<p>The Public Draft stage means that the initial software (data generator, query driver) works and an initial technical specification and documentation have been written. In other words, there is a testable version of the benchmark available for anyone who is interested. Public Draft status does not mean that the benchmark has been adopted yet, it rather means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth <a href="https://ldbcouncil.org/event/fifth-tuc-meeting">Technical User Community meeting</a>. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.</p> +<p>You can also see that we created this new website and a new logo. This website is different from <code>http://ldbc.eu</code> that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.</p> +<p>In the next weeks, you will see many contributors in LDBC post items on this blog. 
Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.</p> + + + + + Social Network Benchmark Goals + https://ldbcouncil.org/post/social-network-benchmark-goals/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/social-network-benchmark-goals/ + <p>Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.</p> +<p>From a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.</p> +<p>With the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, <a href="https://ldbcouncil.org/benchmarks/snb">SNB</a>, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. 
Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.</p> +<p>The SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">LDBC Github repository</a>. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD programming contest 2014</a>.</p> +<p>The SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.</p> +<p>More details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.</p> + + + + + Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + <p>It is with great pleasure that we announce the new LDBC organisation site at <a href="https://www.ldbcouncil.org">www.ldbcouncil.org</a>. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the <em>benchmarks</em> menu on this site.</p> +<p>Those benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.</p> +<p>While the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.</p> +<p>We want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.</p> +<p>Finally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.</p> +<p>In all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.</p> + + + + + 2nd International Workshop on Benchmarking RDF Systems + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + <p>Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops.The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.</p> +<p>More at: <a href="http://events.sti2.at/bersys2014/">http://events.sti2.at/bersys2014/</a></p> + + + + + DATAGEN: Data Generation for the Social Network Benchmark + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + <p>As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 <a href="#references">[1]</a>.</p> +<p>One of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.</p> +<p>For these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:</p> +<ul> +<li><strong>Realism.</strong> The data generated by DATAGEN has to mimic the features of those found in a real social network. 
In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. DATAGEN is aware of the data and link distributions found in a real social network such as Facebook <a href="#references">[2]</a>. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated.</li> +<li><strong>Scalability.</strong> Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters.</li> +<li><strong>Determinism.</strong> DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus, making the comparisons between different systems fair and the benchmarks’ results reproducible.</li> +<li><strong>Usability.</strong> LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been strongly influenced by this philosophy, and therefore it has been designed to be as easy to use as possible.</li> +</ul> +<p>Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (<a href="https://github.com/ldbc/ldbc_snb_datagen">https://github.com/ldbc/ldbc_snb_datagen</a>).</p> +<h4 id="references">References</h4> +<p>[1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. &ldquo;S3g2: A scalable structure-correlated social graph generator.&rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 
156-172.</p> +<p>[2] Prat-Pérez, Arnau, and David Dominguez-Sal. &ldquo;How community-like is the structure of synthetically generated graphs?.&rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. ACM, 2014.</p> + + + + + Getting Started With SNB + https://ldbcouncil.org/post/getting-started-with-snb/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-snb/ + <p>In a previous blog post titled &ldquo;<a href="https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/">Is SNB like Facebook&rsquo;s LinkBench?</a>&rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.</p> +<h3 id="datagen">DATAGEN</h3> +<p>DATAGEN is the data generator used by all the workloads of SNB. <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/">Here</a> we introduced the design goals that drive the development of DATAGEN, which can be summarized as: <em>Realism, Scalability, Determinism and Usability.</em></p> +<p>DATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.</p> +<p><img src="schema.png" alt="image"></p> +<p>For the sake of credibility, data produced by DATAGEN has to be realistic. 
In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:</p> +<ul> +<li> +<p>Realistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as Dbpedia.</p> +</li> +<li> +<p>Correlated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical of that country, to work at companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.</p> +</li> +</ul> +<p>DATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.</p> +<p>Finally, DATAGEN outputs two other things:</p> +<ul> +<li> +<p>Update Streams, which will be used in the future to implement updates in the workloads.</p> +</li> +<li> +<p>Substitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so the query plans of the resulting query executions do not differ significantly.</p> +</li> +</ul> +<p>Configuring and using DATAGEN is easy. 
Please visit <a href="https://github.com/ldbc/ldbc_snb_datagen">this page</a> for more information.</p> +<h3 id="ldbc-driver">LDBC driver</h3> +<p>SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.</p> +<p>It is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation <a href="https://github.com/ldbc/ldbc_driver/wiki">page</a>.</p> +<p>The test sponsor (aka the implementer of the benchmark), has to provide a set of implemented interfaces, that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.</p> +<p>Given a workload consisting of a series of <em>Operations</em>, the test sponsor implements <em>OperationHandlers</em> for them. <em>OperationHandlers</em> are responsible for executing instances of a specific operation (query) type. This is done by overriding the method <em>executeOperation</em>(), which receives as input parameter an <em>Operation</em> instance and returns the result. From the <em>Operation</em> instance, the operation&rsquo;s input parameters can be retrieved, as well as the database connection state.</p> +<p>The database connector is used to initialize, cleanup and get the database connection state. 
The database connector must implement the <em>Db</em> interface, which consists of three methods: <em>onInit</em>(), <em>onCleanup</em>() and <em>getConnectionState</em>(). <em>onInit</em>() is called before the benchmark is executed, and is responsible of initializing the database and registering the different <em>OperationHandlers</em>. <em>onCleanup</em>() is called after the benchmark has completed. Any resources that need to be released should be released here.</p> +<p>Finally, <em>getConnectionState</em>() returns an instance of <em>DbConnectionState</em>, which encapsulates any state that needs to be shared between <em>OperationHandler</em> instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.</p> +<p>A good example on how to implement the benchmark can be found <a href="https://github.com/ldbc/ldbc_driver/wiki/Implementing%20a%20Database%20Connector">here</a>.</p> +<h3 id="workloads">Workloads</h3> +<p>Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are on the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.</p> +<p>Interactive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the data base. These queries are designed to be interactive and target systems capable of responding such queries with low latency for multiple concurrent users. 
Examples of Interactive queries are, given a user, retrieving those friends with a specific name, or finding the most recent post and comments created by your friends.</p> +<p>The Business Intelligence workload will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.</p> +<p>Examples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.</p> +<p>Finally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality, are representative queries of this workload, and will imply touching a vast amount of the dataset.</p> +<h3 id="final-remarks">Final remarks</h3> +<p>This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification <a href="https://github.com/ldbc/ldbc_snb_docs">draft</a>, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.</p> + + + + + Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + <p>The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. 
This post introduces the interactive workload.</p> +<p>The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user&rsquo;s social environment and potentially access data associated with the friends of a user and their friends.</p> +<p>This is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or E. The emphasis is on presenting a rich and timely view of a constantly changing environment.</p> +<p>SNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook Linkbench but SNB Interactive goes well beyond this in richness of schema and queries.</p> +<p>The challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB interactive seeks more agility and adhoc capability also on the operational side.</p> +<p>The dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. 
This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.</p> +<p>The metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.</p> +<p>Different technologies can be used for implementing SNB interactive. The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database APIs.</p> +<p>SNB Interactive is an example of LDBC&rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.</p> +<p>The benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. 
Specifics of availability and coverage may vary.</p> +<p>Subsequent posts will address the workload in more detail.</p> + + + + + Is SNB Like Facebooks LinkBench + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + <p>In this post, I will discuss in some detail the rationale and goals of the design of the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook&rsquo;s own graph benchmark called <a href="https://www.facebook.com/notes/facebook-engineering/linkbench-a-database-benchmark-for-the-social-graph/10151391496443920">LinkBench</a>. We think SNB is the most intricate graph database benchmark to date (it&rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference <a href="http://www.sigmod2014.org/">SIGMOD in Snowbird</a> after being used for this year&rsquo;s <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD Programming Contest</a>, which was about graph analytics.</p> +<p>SNB is intended to provide the following <strong>value</strong> to different stakeholders:</p> +<ul> +<li> +<p>For end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to <em>compare merits of different products</em> and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.</p> +</li> +<li> +<p>For vendors of graph database technology, SNB provides a <em>checklist of features</em> and performance characteristics that helps in product positioning and can serve to guide new development.</p> +</li> +<li> +<p>For researchers, both industrial and academic, the SNB dataset and workload provide <em>interesting challenges</em> in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provides a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.</p> +</li> +</ul> +<p>I should clarify that even though the data model of SNB resembles Facebook (and we&rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use, they don&rsquo;t need LDBC for that. Rather, we take social network data as a model for the much more broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis is not just about finding data by value, but about learning about the <em>connection patterns</em> between data. 
The scenario of the SNB, a social network, was chosen with the following goals in mind:</p> +<ul> +<li> +<p>the benchmark scenario should be <strong>understandable</strong> to a large audience, and this audience should also understand the relevance of managing such data.</p> +</li> +<li> +<p>the scenario in the benchmark should cover the complete range of challenges <strong>relevant</strong> for graph data management, according to the benchmark scope.</p> +</li> +<li> +<p>the query challenges in it should be <strong>realistic</strong> in the sense that, though synthetic, similar data and workloads are encountered in practice.</p> +</li> +</ul> +<p>The SNB is in fact three distinct benchmarks with a common dataset, since there are <em>three different workloads</em>. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.</p> +<ul> +<li> +<p><strong>Interactive Workload.</strong> The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j&rsquo;s Cypher, SPARQL and SQL. The interactive workloads tests a system&rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. 
One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views, however such structures need to be maintained with regards to the continues inserts that also part of the workload. This workload is now in draft stage, which means that the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a> and <a href="https://github.com/ldbc/ldbc_driver">driver software stack</a> are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and involve these.</p> +</li> +<li> +<p><strong>Business Intelligence Workload.</strong> There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.</p> +</li> +<li> +<p><strong>Graph Analytics Workload.</strong> This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph. 
The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.</p> +</li> +</ul> +<p>All the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a>. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to <a href="http://people.cs.uchicago.edu/~tga/pubs/sigmod-linkbench-2013.pdf">LinkBench</a>, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the <strong>low-level</strong> MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. Note that <a href="http://borthakur.com/ftp/sigmod2013.pdf">Facebook&rsquo;s IT infrastructure</a> does not store all user data in MySQL and its modified memcached (&quot;<a href="http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/11730-atc13-bronson.pdf">TAO</a>&quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. 
However, for queries like in the SNB Interactive and BI workloads it <strong>does</strong> matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with <em>correlations</em>, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a &ldquo;choke point&rdquo; for graph data management system where LDBC wants to stimulate innovation).</p> + + + + + Making It Interactive + https://ldbcouncil.org/post/making-it-interactive/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-it-interactive/ + <p><em>Synopsis:</em> Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.</p> +<p>It is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.</p> +<p>As part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. 
These touch, depending on the scale, upwards of a million entities in the database.</p> +<p>Now, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.</p> +<p>So, LDBC SNB has a twofold task:</p> +<ol> +<li>In order to be a credible interactive workload, it must in fact have characteristics of one</li> +<li>In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics.</li> +</ol> +<p>Designing a workload presents specific challenges:</p> +<ol> +<li>The workload must be realistic enough for users to identify with it.</li> +<li>The workload must pose challenges and drive innovation in a useful direction.</li> +<li>The component operations must all play a noticeable role in it. If the operation&rsquo;s relative performance does not affect the score, why is it in the workload?</li> +</ol> +<p>The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.</p> +<p>Very roughly, the choke points (technical challenges) of SNB interactive are as follows:</p> +<ul> +<li>Random access - Traversing between people, content makes large numbers of random lookups. These can be variously parallelized and/or vectored.</li> +<li>Query optimization must produce right plans - The primary point is join order and join type. Index vs. 
hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join.</li> +<li>When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations.</li> +<li>Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities.</li> +</ul> +<p>Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is do-able and plausible in the scenario.</p> +<p>In online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.</p> +<p>A key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.</p> +<p>The other aspect is the metric, typically some variation on operations per unit of time.</p> +<p>All these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio. 
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.</p> +<p>So how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.</p> +<p>But SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.</p> +<p>Having this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to meet the following objectives:</p> +<ul> +<li>Short queries should collectively be about 45% of the CPU load.</li> +<li>Updates will be under 5%</li> +<li>Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others.</li> +</ul> +<p>The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.</p> +<p>There is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.</p> +<p>In the next post we will look at the actual mix and execution times on the test system.</p> + + + + + SNB Data Generator - Getting Started + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">this</a> and <a href="https://ldbcouncil.org/post/getting-started-with-snb">this</a>) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. 
In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.</p> +<h3 id="getting-and-configuring-hadoop">Getting and Configuring Hadoop</h3> +<p>DATAGEN runs on top of hadoop 1.2.1 to be scalable. You can download it from here. Open a console and type the following commands to decompress hadoop into the /home/user folder:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz hadoop-1.2.1.tar.gz +</span></span></code></pre></div><p>For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done for configuring it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster machine), visit the <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/wiki/Configuration">LDBC DATAGEN wiki</a>.</p> +<h3 id="getting-and-configuring-datagen">Getting and configuring DATAGEN</h3> +<p>Before downloading DATAGEN, be sure to fulfill the following requirements:</p> +<ul> +<li>Linux based machine</li> +<li>java 1.6 or greater</li> +<li>python 2.7.X</li> +<li>maven 3</li> +</ul> +<p>After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/releases">here</a>. 
Again, decompress the downloaded file with the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz +</span></span></code></pre></div><p>This will create a folder called “ldbc_snb_datagen-0.1.2”.</p> +<p>DATAGEN provides <em>run.sh</em>, a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>HADOOP_HOME<span style="color:#f92672">=</span>/home/user/hadoop-1.2.1 +</span></span><span style="display:flex;"><span>LDBC_SNB_DATAGEN_HOME<span style="color:#f92672">=</span>/home/user/ldbc_snb_datagen +</span></span></code></pre></div><p>HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute the <em>run.sh</em> script to compile and execute DATAGEN using default parameters. 
Type the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user/ldbc_snb_datagen-0.1.2 +</span></span><span style="display:flex;"><span>$ ./run.sh +</span></span></code></pre></div><p>This will run DATAGEN, and two folders will be created in the same directory: <em>social_network</em> containing the scale factor 1 dataset with csv uncompressed files, and <em>substitution_parameters</em> containing the substitution parameters needed by the driver to execute the benchmark.</p> +<h3 id="changing-the-generated-dataset">Changing the generated dataset</h3> +<p>The characteristics of the dataset to be generated are specified in the <em>params.ini</em> file. By default, this file has the following content:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:1</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:1</span> +</span></span></code></pre></div><p>The following is the list of options and their default values supported by DATAGEN:</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>scaleFactor</td> +<td>1</td> +<td>&ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000&rdquo;</td> +</tr> +<tr> +<td>serializer</td> +<td>csv</td> +<td>&ldquo;The format of the output data. 
Options are: csv, csv_merge_foreign, ttl&rdquo;</td> +</tr> +<tr> +<td>compressed</td> +<td>FALSE</td> +<td>Specifies to compress the output data in gzip.</td> +</tr> +<tr> +<td>outputDir</td> +<td>./</td> +<td>Specifies the folder to output the data.</td> +</tr> +<tr> +<td>updateStreams</td> +<td>FALSE</td> +<td>&ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static&rdquo;</td> +</tr> +<tr> +<td>numThreads</td> +<td>1</td> +<td>Sets the number of threads to use. Only works for pseudo-distributed mode</td> +</tr> +</tbody> +</table> +<p>For instance, a possible <em>params.ini</em> file could be the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:30</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:ttl</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For those not interested on generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>numPersons</td> +<td>-</td> +<td>The number of persons to generate</td> +</tr> +<tr> +<td>numYears</td> +<td>-</td> +<td>The amount of years of activity</td> 
+</tr> +<tr> +<td>startYear</td> +<td>-</td> +<td>The start year of simulation.</td> +</tr> +</tbody> +</table> +<p>The following is an example of another possible <em>params.ini</em> file</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">numPersons:100000</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numYears:3</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">startYear:2010</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv_merge_foreign</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/">GitHub</a>!</p> + + + + + The Day of Graph Analytics + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + <p><em>Note: consider this post as a continuation of the &ldquo;<a href="https://ldbcouncil.org/post/making-it-interactive">Making it interactive</a>&rdquo; post by Orri Erling.</em></p> +<p>I have now completed the <a href="https://github.com/openlink/virtuoso-opensource">Virtuoso</a> TPC-H work, including scale out. 
Optimization possibilities extend to infinity but the present level is good enough. <a href="http://www.tpc.org/tpch/">TPC-H</a> is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.</p> +<p>So, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.</p> +<p>The BI part is like TPC-H, except for adding the following challenges:</p> +<ul> +<li> +<p>Joins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.</p> +</li> +<li> +<p>Transitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.</p> +</li> +<li> +<p>Transitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.</p> +</li> +<li> +<p>Graph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. 
This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.</p> +</li> +<li> +<p>Running one query with parameters from different buckets, implying different best plan.</p> +</li> +<li> +<p>Data correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.</p> +</li> +<li> +<p>Large intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.</p> +</li> +<li> +<p>More unions and outer joins.</p> +</li> +</ul> +<p>The idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.</p> +<p>For rules and metric, we can use the TPC-H or <a href="http://www.tpc.org/tpcds/default.asp">TPC-DS</a> ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and <a href="http://www.openstreetmap.org/">Open Street Map</a>. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.</p> +<p>Doing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.</p> +<p>As a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. 
Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or &ldquo;Platonic&rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.</p> +<p>So, the transitive derived table construct can have pluggable aggregations, e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.</p> +<p>The distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. forward and backward in betweenness centrality.</p> +<p>This brings us to the graph analytics proper, which is often done in BSP style, e.g. <a href="http://es.slideshare.net/shatteredNirvana/pregel-a-system-for-largescale-graph-processing">Pregel</a>, <a href="http://giraph.apache.org">Giraph</a>, <a href="http://uzh.github.io/signal-collect/">Signal-Collect</a>, some but not all <a href="http://ppl.stanford.edu/main/green_marl.html">Green-Marl</a> applications. 
In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.</p> +<p>With BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as <a href="http://graphbench.org/">graphbench.org</a> but will be adapted to the SNB dataset.</p> +<p>The analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. 
The interested may pitch in as the matter comes up.</p> + + + + + Using LDBC SPB to Find OWLIM Performance Issues + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + Wed, 20 Aug 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + <p>During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (LDBC-SPB) as a part of our development and release process.</p> +<p>First thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.</p> +<p>Initially we’ve decided to fix some of the benchmark parameters:</p> +<ul> +<li>the dataset size - 50 million triples (LDBC-SPB50)</li> +<li>benchmark warmup and benchmark run times - 60s and 600s respectively.</li> +<li>maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations)</li> +<li>maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations)</li> +<li>generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run.</li> +</ul> +<p>Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.</p> +<p>We’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. 
Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms of editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.</p> +<p>The following two charts show how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on a bare-metal hardware.</p> +<p><img src="16-2-Performance.png" alt="image"></p> +<p>Figure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.</p> +<p><img src="8-0-Performance.png" alt="image"></p> +<p>Figure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation agents running simultaneously. Read-only mode.</p> +<p>Another thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark&rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners can not be compared to backward-chaining ones which is not the goal of the benchmark. 
Loading performance is not measured “officially” by LDBC-SPB (although time for loading the data is reported), but it is a good thing to have when comparing RDF Stores.</p> +<p>An additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called <em>checkConformance</em>. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. The result of that phase is a list of all rules that have been passed or failed which is very useful for regression testing.</p> + + + + + Fourth TUC meeting + https://ldbcouncil.org/event/fourth-tuc-meeting/ + Thu, 03 Apr 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fourth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.</p> +<p>This will be a one-day event at CWI in Amsterdam on <em>Thursday April 3, 2014</em>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<p><strong>For presenters please limit your talks to just 15 minutes</strong></p> +<h3 id="agenda">Agenda</h3> +<p><strong>April 3rd</strong></p> +<ul> +<li> +<p>10:00 Peter Boncz (VUA) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506371.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=JYWVgrP1kVY">video</a>: <em>LDBC project status update</em></p> +</li> +<li> +<p>10:20 Norbert Martinez (UPC) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506375.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=4yREJQ3yDr0">video</a>: <em>Status update on the LDBC Social Network Benchmark (SNB) task force</em>.</p> +</li> +<li> +<p>10:50 Alexandru Iosup (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506363.ppt">ppt</a>, <a href="https://www.youtube.com/watch?v=ulT-RFwKpOE">video</a>: <em>Towards Benchmarking Graph-Processing Platforms</em></p> +</li> +<li> +<p>11:10 Mike Bryant (Kings College) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506364.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=KiHRTu9xx0A">video</a>: <em>EHRI Project: Archival Integration with Neo4j</em></p> +</li> +</ul> +<p><strong>11:30 coffee</strong></p> +<ul> +<li> +<p>11:50 Thilo Muth (University of Magdeburg) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506369.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=5xH3UDLP6Oc">video</a>: <em>MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis</em></p> +</li> +<li> +<p>12:10 Davy Suvee (Janssen Pharmaceutica / Johnson &amp; Johnson) – <a 
href="https://www.youtube.com/watch?v=XN3LRJUfJIU">video</a>: <em>Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph</em></p> +</li> +<li> +<p>12:30 Yongming Luo (TU Eindhoven) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506366.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=g_my3tBB2_s">video</a>: <em>Regularities and dynamics in bisimulation reductions of big graphs</em></p> +</li> +<li> +<p>12:50 Christopher Davis (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506370.pdf">pdf</a>, <a href="https://www.youtube.com/channel/UC6HbzfJ4016Vez-2HKNeDag">video</a>: <em>Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues</em></p> +</li> +</ul> +<p><strong>13:10 - 14:30 lunch @ restaurant Polder</strong></p> +<ul> +<li> +<p>14:30 <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506365.pptx">SPB task force report</a></p> +</li> +<li> +<p>15:00 Bastiaan Bijl (Sysunite) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506373.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=TsCeKDHShMY">video</a>: <em>Using a semantic approach for monitoring applications in large engineering projects</em></p> +</li> +<li> +<p>15:20 Frans Knibbe (Geodan) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506372.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=uAX-m4OewPM">video</a>: <em>Benchmarks for geographical data</em></p> +</li> +<li> +<p>15:40 Armando Stellato (University of Rome, Tor Vergata &amp; UN Food and Agriculture Organization) – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506374.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=mfA4csAs72Y">video</a>: <em>VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges</em></p> +</li> +</ul> +<p><strong>16:00 coffee</strong></p> +<ul> +<li> +<p>16:20 Ralph Hodgson (TopQuadrant) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=ZUDnVw9P_Rc">video</a>: <em>Customer experiences in implementing SKOS-based vocabulary management systems</em></p> +</li> +<li> +<p>16:40 Simon Jupp (European Bioinformatics Institute) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506368.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=CgTuOGK92W8">video</a>: <em>Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.</em></p> +</li> +<li> +<p>17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506381.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=QTc3yOgoEsg">video</a>: <em>Breakmarking UniProt RDF. 
SPARQL queries that make your database cry&hellip;</em></p> +</li> +<li> +<p>17:20 Rein van &rsquo;t Veer (Digital Heritage Netherlands) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506380.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=2vDrZoskGyQ">video</a> <em>Time and space for heritage</em></p> +</li> +<li> +<p>17:40 <strong>end of meeting</strong></p> +</li> +<li> +<p>19:00 - 21:30 Social Dinner in restaurant Boom</p> +</li> +</ul> +<p><strong>April 4th</strong></p> +<p>LDBC plenary meeting for project partners.</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506362.ppt">Benchmarking Graph-Processing Platforms: A Vision</a> – Alexandru Iosup</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p>The meeting will be held at the Dutch national research institute for computer science and mathematics (<a href="http://www.cwi.nl">CWI</a> - Centrum voor Wiskunde en Informatica). 
It is located at <a href="http://www.amsterdamsciencepark.nl/">Amsterdam Science Park</a>:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505821.jpg" alt=""></p> +<p>(<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505820.pdf">A5 map</a>)</p> +<h6 id="travel">Travel</h6> +<p><strong>Arriving &amp; departing:</strong></p> +<p>Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, <a href="http://www.schiphol.com/">www.schiphol.nl</a>) that serves all main European carriers and also very many low-fare carriers.</p> +<p><a href="http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane">http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane</a></p> +<p><strong>Trains</strong> (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) &ndash; which station you are also likely arriving at in case of an international train trip.</p> +<p>From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 &ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).</p> +<p><strong>Taxi</strong> is an alternative, though expensive. 
The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).</p> +<p><strong>Public transportation</strong> (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.</p> +<p><strong>Only the &ldquo;disposable&rdquo; cards are interesting for you as visitor.</strong></p> +<p>Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.</p> +<p><strong>Getting Around:</strong> the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.</p> +<p><strong>Cars</strong></p> +<p>In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the &ldquo;WCW&rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.</p> +<p><strong>Arriving at CWI:</strong> Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. 
Then, you&rsquo;ll receive a visitor&rsquo;s pass that allows you to enter our building.</p> +<p><strong>Social Dinner</strong></p> +<p>The social dinner will take place at 7pm on April 3 in Restaurant Boom (<a href="http://www.boometenendrinken.nl/">boometenendrinken.nl</a>), Linnaeusstraat 63, Amsterdam.</p> + + + + + Third TUC Meeting + https://ldbcouncil.org/event/third-tuc-meeting/ + Tue, 19 Nov 2013 08:00:00 +0000 + + https://ldbcouncil.org/event/third-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!</p> +<p>This will be a one-day event in London on the <strong>19 November 2013</strong> running in collaboration with the <a href="http://www.graphconnect.com/london/">GraphConnect</a> event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using the following coupon code: <strong>LDBCTUC</strong>.</p> +<p>The TUC event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology</li> +<li>Industry discussions on the contents of the benchmarks</li> +</ul> +<p>We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.</p> +<p>We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a></li> +<li><a href="#ldbctuc-background">LDBC/TUC Background</a> +<ul> +<li><a href="#social-network-benchmark">Social Network Benchmark</a></li> +<li><a href="#semantic-publishing-benchmark">Semantic Publishing Benchmark</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>November 19th - Public TUC Meeting</strong></p> +<p>8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)</p> +<p>short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)</p> +<p>NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.</p> +<p>10:00 TUC Meeting Opening (Peter Boncz)</p> +<p>10:10 TUC Presentations (RDF Application Descriptions)</p> +<ul> +<li>Johan Hjerling (BBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275669.pdf">BBC Linked Data and the Semantic Publishing Benchmark</a></strong></em></li> +<li>Andreas Both (Unister): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505027.pdf">Ontology-driven applications in an e-commerce context</a></strong></em></li> +<li>Nuno Carvalho (Fujitsu Laboratories Europe): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275666.pdf"><em><strong>Fujitsu RDF use cases and benchmarking requirements</strong></em></a></li> +<li>Robina Clayphan (Europeana): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816977.ppt">Europeana and Open Data</a></strong></em></li> +</ul> +<p>11:30 Semantic Publishing Benchmark (SPB)</p> +<ul> +<li>Venelin Kotsev (Ontotext - LDBC): <em><strong><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">Semantic Publishing Benchmark Task Force Update</a></strong></em> and <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">report</a></strong></em></li> +</ul> +<p>12:00-13:00 Lunch at the Graph Connect venue</p> +<p><em>Talks During Lunch:</em></p> +<ul> +<li>Pedro Furtado, Jorge Bernardino (Univ. Coimbra): <strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275671.pdf">KEYSTONE Cost Action</a></strong></li> +</ul> +<p>13:00 TUC Presentations (Graph Application Descriptions)</p> +<ul> +<li>Minqi Zhou / Weining Qian (East China Normal University): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275670.pdf">Elastic and realistic social media data generation</a></strong></em></li> +<li>Andrew Sherlock (Shapespace): <em><strong>Shapespace Use Case</strong></em></li> +<li>Sebastian Verheughe (Telenor): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275667.pdf">Real-time Resource Authorization</a></strong></em></li> +</ul> +<p>14:00 Social Network Benchmark (SNB)</p> +<ul> +<li>Norbert Martinez (UPC - LDBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505025.pdf">Social Network Benchmark Task Force Update</a></strong></em> and <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">Report</a></li> +</ul> +<p><em>14:30 Break</em></p> +<p>14:45 TUC Presentations (Graph Analytics)</p> +<ul> +<li>Keith Houck (IBM): <em><strong>Benchmarking experiences with [System G Native Store (tentative title)]</strong></em></li> +<li>Abraham Bernstein (University of 
Zurich): <em><strong>Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store</strong></em></li> +<li>Luis Ceze (University of Washington): <em><strong>Grappa and GraphBench Status Update</strong></em></li> +</ul> +<p><em>15:45 Break</em></p> +<p>16:00 TUC Presentations* (Possible Future RDF Benchmarking Topics)*</p> +<ul> +<li>Christian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): <em><strong>CIDOC-CRM</strong></em></li> +<li>Atanas Kiryakov (Ontotext): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275672.pdf">Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM)</a></strong></em></li> +<li>Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275668.pdf">Geographica: A Benchmark for Geospatial RDF Stores</a></strong></em></li> +<li>Xavier Lopez (Oracle): <em><strong>W3C Property Graph progress</strong></em></li> +<li>Thomas Scharrenbach (University Zurich) <em><strong>PCKS: Benchmarking Semantic Flow Processing Systems</strong></em></li> +</ul> +<p>17:20 Meeting Conclusion (Josep Larriba Pey)</p> +<p>17:30 End of TUC meeting</p> +<p>19:00 Social dinner</p> +<p><strong>November 20th - Internal LDBC Meeting</strong></p> +<p>10:00 Start</p> +<p>12:30 <em>End of meeting</em></p> +<ul> +<li>coffee and lunch provided</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p><strong>Date</strong></p> +<p>19th November 2013</p> +<p><strong>Location</strong></p> +<p>The TUC meeting will be held in <strong>The Tower</strong> hotel (<a href="http://goo.gl/qZt8Fz">Google Maps link</a>) approximately 4 minutes walk from the <a href="http://www.graphconnect.com/london/">GraphConnect</a> conference in London.</p> +<p>Getting there</p> +<ul> +<li>From City Airport is the easiest: short ride 
on the DLR to Tower Gateway. Easy.</li> +<li>From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554995.pdf">See attached</a>.</li> +</ul> +<h3 id="ldbctuc-background">LDBC/TUC Background</h3> +<p>Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SNB_Report_Nov2013.pdf</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SPB_Report_Nov2013.pdf</a></li> +</ul> +<p>A summary of these efforts can be read below or, for a more detailed account, please refer to: <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554967.pdf">The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort</a>. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.</p> +<h4 id="social-network-benchmark">Social Network Benchmark</h4> +<p>The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. 
The social network scenario was chosen with the following goals in mind:</p> +<ul> +<li>it should be understandable, and the relevance of managing such data should be understandable</li> +<li>it should cover the complete range of interesting challenges, according to the benchmark scope</li> +<li>the queries should be realistic, i.e., similar data and workloads are encountered in practice</li> +</ul> +<p>SNB includes a data generator for creation of synthetic social network data with the following characteristics:</p> +<ul> +<li>data schema is representative of real social networks</li> +<li>data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions</li> +<li>the software generator is easy to use, configurable and scalable</li> +</ul> +<p>SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:</p> +<ul> +<li><strong>Interactive</strong> +<ul> +<li>Tests system throughput with relatively simple queries and concurrent updates; it is designed to test ACID features and scalability in an online operational setting.</li> +<li>The targeted systems are expected to be those that offer transactional functionality.</li> +</ul> +</li> +<li><strong>Business Intelligence</strong> +<ul> +<li>Consists of complex structured queries for analyzing online behavior of users for marketing purposes; it is designed to stress query execution and optimization.</li> +<li>The targeted systems are expected to be those that offer an abstract query language.</li> +</ul> +</li> +<li><strong>Graph Analytics</strong> +<ul> +<li>Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.</li> +<li>Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need 
isolation.</li> +<li>The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.</li> +</ul> +</li> +</ul> +<h4 id="semantic-publishing-benchmark">Semantic Publishing Benchmark</h4> +<p>The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.</p> +<p>The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works &ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.</p> +<p>The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. 
Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.</p> +<p>Two separate workloads are modeled in SPB:</p> +<ul> +<li><strong>Editorial:</strong> Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.</li> +<li><strong>Aggregation:</strong> Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as &ldquo;dynamic&rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.</li> +</ul> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505026.pdf">Status of the Semantic Publishing Benchmark</a></p> + + + + + Second TUC Meeting + https://ldbcouncil.org/event/second-tuc-meeting/ + Mon, 22 Apr 2013 10:00:00 +0000 + + https://ldbcouncil.org/event/second-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.</p> +<p>This will be a two day event in Munich on the <strong>22/23rd April 2013</strong>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +<li><a href="#venue">Venue</a> +<ul> +<li><a href="#getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</a></li> +<li><a href="#getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</a></li> +<li><a href="#getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</a></li> +</ul> +</li> +<li><a href="#getting-there">Getting there</a></li> +<li><a href="#social-dinner">Social Dinner</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>April 22nd</strong></p> +<p>10:00 <em>Registration.</em><br> +10:30 Josep Lluis Larriba Pey (UPC) - <em>Welcome and Introduction.</em><br> +10:30 Peter Boncz (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687373.pptx">LDBC: goals and status</a></p> +<p><em>Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)</em></p> +<p>11:00 Josep Lluis Larriba Pey (UPC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687372.pdf">Social Network Benchmark Task Force</a><br> +11:30 Gustavo González (Mediapro): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687367.pdf">Graph-based User Modeling through Real-time Social Streams</a><br> +12:00 Klaus Großmann (Dshini): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687365.pdf">Neo4j at Dshini</a></p> +<p>12:30 Lunch</p> +<p><em>Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)</em></p> 
+<p>13:30 Barry Bishop (Ontotext): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687366.pptx">Semantic Publishing Benchmark Task Force</a><br> +14:00 Dave Rogers (BBC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687364.pptx">Linked Data Platform at the BBC</a><br> +14:30 Edward Thomas (Wolters Kluwer): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687374.pdf">Semantic Publishing at Wolters Kluwer</a></p> +<p>15:00 Coffee break</p> +<p><em>Projects Related to LDBC</em></p> +<p>15:30 Fabian Suchanek (MPI): &ldquo;YAGO: A large knowledge base from Wikipedia and WordNet&rdquo;<br> +16:00 Antonis Loziou (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687375.pptx">The OpenPHACTS approach to data integration</a><br> +16:30 Mirko Kämpf (Brox): &ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case&rdquo;</p> +<p>17:00 <em>End of first day</em></p> +<p>19:00 Social dinner</p> +<p><strong>April 23rd</strong></p> +<p><em>Industry &amp; Hardware Aspects</em></p> +<p>10:00 Xavier Lopez (Oracle): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687384.pdf">Graph Database Performance an Oracle Perspective.pdf</a><br> +10:30 Pedro Trancoso (University of Cyprus): &ldquo;Benchmarking and computer architecture: the research side&rdquo;</p> +<p>11:00 Coffee break</p> +<p><em>Future Steps and TUC feedback session</em></p> +<p>11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force<br> +12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force</p> +<p>12:30 <em>End of meeting</em></p> +<h3 id="logistics">Logistics</h3> +<h4 id="date">Date</h4> +<p>22nd and 23rd April 2013</p> +<h4 
id="location">Location</h4> +<p>The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:</p> +<p>LRZ (Leibniz-Rechenzentrum)<br> +Boltzmannstraße 1<br> +85748 Garching, Germany</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi and Subway <a href="http://www.in.tum.de/fileadmin/user_upload/Sonstiges/anfahrt_garching.pdf">Ubahn</a></p> +<h5 id="getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</h5> +<p>Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.</p> +<h5 id="getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</h5> +<ol> +<li> +<p>(except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.</p> +</li> +<li> +<p>S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.</p> +</li> +<li> +<p>Taxi: fare is ca. 30-40 euros.</p> +</li> +</ol> +<p>For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. 
It will cover all public transportation for that day.</p> +<h5 id="getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</h5> +<p>The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.</p> +<p><strong>Finding LRZ@TUM</strong></p> +<p><a href="http://www.openstreetmap.org/?mlat=48.2615702464&amp;mlon=11.6686558264&amp;zoom=32">OpenStreetMap link</a></p> +<p><a href="https://maps.google.com/maps?q=48.2615702464,11.6686558264&amp;spn=0.005,0.005&amp;t=k">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687268.gif" alt=""></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687269.gif" alt=""></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying: Munich</strong> airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.</p> +<p><strong>S-Bahn:</strong> S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the tickets needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.</p> +<p><strong>Taxi:</strong> taxi from the airport to the city center costs approximately 50 euros</p> +<h4 id="social-dinner">Social Dinner</h4> +<p>The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)</p> +<p>Address: Hofbräuhaus, Platzl 9, Munich</p> + + + + + First TUC Meeting + https://ldbcouncil.org/event/first-tuc-meeting/ + Mon, 19 Nov 2012 09:00:00 +0100 + + https://ldbcouncil.org/event/first-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the <strong>19/20th November 2012</strong>.</p> +<p>So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event wil include:</p> +<ul> +<li>Introduction by the coordinator and technical director explaining the objectives of the LDBC project</li> +<li>Invitation to users to explain their use-cases and describe the limitations they have found in current technology</li> +<li>Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points</li> +</ul> +<p>The exact agenda will be published here as things get finalised before the event.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#slide">Slide</a> +<ul> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +</ul> +</li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>We will start at 9:00 on Monday for a full day, followed by a half a day on Tuesday to allow attendees to travel home on the evening of the 20th.</p> +<p><strong>Day 1</strong></p> +<p>09:00 Welcome (Location: Aula Master)<br> +09:30 Project overview (Emphasis on task forces?) + Questionnaire results?<br> +10:30 Coffee break<br> +11:00 User talks (To gather information for use cases?)</p> +<p>13:00 Lunch</p> +<p>14:00 User talks (cont.)<br> +15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).<br> +16:00 Task force proposals (consortium)<br> +17:00 Finish first day</p> +<p>20:00 Social dinner</p> +<p><strong>Day 2</strong></p> +<p>10:00 Task force discussion (consortium + TUC)<br> +11:00 Coffe break<br> +11:30 Task force discussion (consortium + TUC)<br> +12:30 Summaries (Task forces, use cases, &hellip;) and actions</p> +<p>13:00 Lunch and farewell</p> +<p>15:00 LDBC Internal meeting</p> +<h3 id="slide">Slide</h3> +<p>Opening session:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686995.pptx">CWI – Peter Boncz</a> – Objectives</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687001.pdf">UPC – Larri</a> – Questionnaire</li> +</ul> +<p>User stories:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686998.pdf">BBC – Jem Rayfield</a></li> +<li>CA Technologies – Victor Muntés</li> 
+<li>Connected Discovery (Open Phacts) – Bryn Williams-Jones</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687003.pptx">Elsevier – Alan Yagoda</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687000.pptx">ERA7 Bioinformatics – Eduardo Pareja</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687005.pptx">Press Association – Jarred McGinnis</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687004.pptx">RJLee – David Neuer</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686994.pdf">Yale – Lec Maj</a></li> +</ul> +<p>Benchmark proposals:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686991.pdf">Publishing benchmark proposal – Ontotext – Barry Bishop</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687002.pdf">Social Network Benchmark Proposal – UPC – Larri</a></li> +</ul> +<h4 id="logistics">Logistics</h4> +<h5 id="date">Date</h5> +<p>19th and 20th November 2012</p> +<h5 id="location">Location</h5> +<p>The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<p><strong>Finding UPC</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<p><strong>Finding the meeting room</strong></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<p><strong>The locations of the airport and the city centre</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933317.jpg" alt=""></p> +<p><strong>Bus map</strong></p> + + + + + \ No newline at end of file diff --git a/benchmarks/overview/index.html b/benchmarks/overview/index.html new file mode 100644 index 00000000..b9f4c644 --- /dev/null +++ b/benchmarks/overview/index.html @@ -0,0 +1,352 @@ + + + + + Overview of LDBC Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Overview of LDBC Benchmarks

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

LDBC currently offers the following benchmarks:

+ +

Uses of LDBC benchmarks are subject to the Fair Use Policy for LDBC Benchmarks.

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/1/index.html b/benchmarks/page/1/index.html new file mode 100644 index 00000000..7060f873 --- /dev/null +++ b/benchmarks/page/1/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/ + + + + + + diff --git a/benchmarks/page/2/index.html b/benchmarks/page/2/index.html new file mode 100644 index 00000000..cfa09642 --- /dev/null +++ b/benchmarks/page/2/index.html @@ -0,0 +1,804 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + + + + +
+
+
+ +

LDBC and Apache Flink

+
Tags:
+ FLINK + , DATAGEN + , SNB + +
+
+ +

Apache Flink [1] is an open source platform for distributed stream and batch data processing. Flink’s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.

+

+

Flink offers multiple APIs to process data …

+ +
+
+ +
+ + +
+
+
+ +

Elements of Instance Matching Benchmarks: a Short Overview

+
Tags:
+ INSTANCE MATCHING + , SPB + +
+
+ +

The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. In the vast number of data sources participating in the Linked Data Cloud, this information is not …

+ +
+
+ +
+ + +
+
+ +
+ +

In this post we will look at running the LDBC SNB on Virtuoso.

+

First, let’s recap what the benchmark is about:

+
    +
  1. +

    fairly frequent short updates, with no update contention worth mentioning

    +
  2. +
  3. +

    short random lookups

    +
  4. +
  5. +

    medium complex queries centered around a person’s social environment

    +
  6. +
+

The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an …

+ +
+
+ +
+ + +
+
+
+ +

SNB and Graphs Related Presentations at GRADES '15

+
Tags:
+ SIGMOD + , GRAPHALYTICS + , GRADES + , SNB + , DATAGEN + , WORKSHOP + +
+
+ +

On the 31st of May, the GRADES workshop will take place in Melbourne as part of the ACM SIGMOD/PODS conference. GRADES started as an initiative of the Linked Data Benchmark Council at SIGMOD/PODS 2013, held in New York.

+

Among the papers published in this edition we have “Graphalytics: A Big Data Benchmark for Graph-Processing Platforms”, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 2: Modeling Choices

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

​SNB Interactive is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.

+

In the case of Virtuoso, we have played with SQL and SPARQL implementations. For a fixed schema and well known workload, SQL will always win. The reason for this is that …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/3/index.html b/benchmarks/page/3/index.html new file mode 100644 index 00000000..777fef31 --- /dev/null +++ b/benchmarks/page/3/index.html @@ -0,0 +1,791 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference

+
Tags:
+ SIGMOD + , GRADES + , SNB + , GRAPHALYTICS + , WORKSHOP + +
+
+ +

LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.

+

On the industry track, LDBC will be presenting the Social Network Benchmark Interactive …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 1: What Is SNB Interactive Really About?

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.

+

With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its …

+ +
+
+ +
+ + +
+
+
+ +

Why Do We Need an LDBC SNB-Specific Workload Driver?

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

In a previous 3-part blog series we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. What we didn’t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more …

+ +
+
+ +
+ + +
+
+
+ +

Event Driven Post Generation in Datagen

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.

+

First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), …

+ +
+
+ +
+ + +
+
+
+ +

The LDBC Datagen Community Structure

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.

+

When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution and the clustering coefficient. Real graphs, and …

+ +
+
+ +
+ + +
+
+
+ +

Industry Relevance of the Semantic Publishing Benchmark

+
Tags:
+ INDUSTRY + , SPB + +
+
+ + + post/industry-relevance-of-the-semantic-publishing-benchmark/01_sf_newspapers.png +
+ +
+ +

Publishing and media businesses are going through transformation

+

I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/4/index.html b/benchmarks/page/4/index.html new file mode 100644 index 00000000..869aa22a --- /dev/null +++ b/benchmarks/page/4/index.html @@ -0,0 +1,757 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

OWL-Empowered SPARQL Query Optimization

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution …

+ +
+
+ +
+ + +
+
+
+ +

Person Activity Subgraph Features in LDBC DATAGEN

+
Tags:
+ SNB + , DATAGEN + +
+
+ +

When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other things that are worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 2: Tracking Dependencies Between Queries

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

The SNB Driver part 1 post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we’ll drill down deeper into the details of what it means to execute “dependent queries” during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 3: Workload Execution Putting It All Together

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

Up until now we have introduced the challenges faced when executing the LDBC SNB benchmark, as well as explained how some of these are overcome. With the foundations laid, we can now explain precisely how operations are executed.

+

Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these …

+ +
+
+ +
+ + +
+
+
+ +

Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide

+
Tags:
+ SPB + , SESAME + , RDF + , TUTORIAL + , GUIDE + +
+
+ +

Until now we have discussed several aspects of the Semantic Publishing Benchmark (SPB) such as the difference in performance between virtual and real servers configuration, how to choose an appropriate query mix for a benchmark run and our experience with using SPB in the development process of GraphDB for finding performance issues.

+

In this post we provide a step-by-step guide on how to run SPB using the Sesame RDF data store on a fresh install …

+ +
+
+ +
+ + +
+
+
+ +

Semantic Publishing Instance Matching Benchmark

+
Tags:
+ INSTANCE MATCHING + , BENCHMARK + +
+
+ +

The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the-art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.

+

The SPIMBench …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/5/index.html b/benchmarks/page/5/index.html new file mode 100644 index 00000000..0c4b70b1 --- /dev/null +++ b/benchmarks/page/5/index.html @@ -0,0 +1,757 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Further Developments in SNB BI Workload

+
Tags:
+ SNB + , BI + +
+
+ +

We are presently working on the SNB BI workload. Andrey Gubichev of TU München and I are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.

+

As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.

+

There are obvious marketing applications for a SNB-like dataset. There are also security …

+ +
+
+ +
+ + +
+
+
+ +

Sizing AWS Instances for the Semantic Publishing Benchmark

+
Tags:
+ SPB + , AMAZON + , EC2 + , AWS + , RDF + +
+
+ +

LDBC’s Semantic Publishing Benchmark (SPB) measures the performance of an RDF database under a load typical for metadata-based content publishing, such as the famous BBC Dynamic Semantic Publishing scenario. Such a load combines tens of updates per second (e.g. adding metadata about new articles) with an even higher volume of read requests (SPARQL queries collecting recent content and data to generate a web page on a specific subject, e.g. Frank …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: a Realistic Social Network Data Generator

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

In previous posts (Getting started with snb, DATAGEN: data generation for the Social Network Benchmark), Arnau Prat discussed the main features and characteristics of DATAGEN: realism, scalability, determinism, usability. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 1

+
Tags:
+ SNB + , DRIVER + , TPC-C + , INTERACTIVE + +
+
+ +

In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: …

+ +
+
+ +
+ + +
+
+
+ +

Making Semantic Publishing Execution Rules

+
Tags:
+ SPB + , TEST RUN + +
+
+ +

LDBC SPB (Semantic Publishing Benchmark) is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC’s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an Ontotext Graph DB deployment. Graph DB was formerly known as Owlim.

+

So, in SPB we wanted to address substantially more complex queries than the lookups that …

+ +
+
+ +
+ + +
+
+ +
+ +

The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/6/index.html b/benchmarks/page/6/index.html new file mode 100644 index 00000000..4603a7dc --- /dev/null +++ b/benchmarks/page/6/index.html @@ -0,0 +1,772 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Choke Point Based Benchmark Design

+
Tags:
+ DATABASE + , BENCHMARK + , DESIGN + +
+
+ +

The mission of the Linked Data Benchmark Council (LDBC) is to design and maintain benchmarks for graph data management systems, to establish and enforce standards in running these benchmarks, and to publish and arbitrate around the official benchmark results. The council and its https://ldbcouncil.org website just launched, and in its first 1.5 years of existence, most effort at LDBC has gone into investigating the needs of the field through interaction …

+ +
+
+ +
+ + +
+
+
+ +

New Website Online LDBC Benchmarks Reach Public Draft

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing for the past 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In the case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be …

+ +
+
+ +
+ + +
+
+
+ +

Social Network Benchmark Goals

+
Tags:
+ SNB + , DATAGEN + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ +

Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established …

+ +
+
+ +
+ + +
+
+ +
+ +

It is with great pleasure that we announce the new LDBC organisation site at www.ldbcouncil.org. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its …

+ +
+
+ +
+ + +
+
+
+ +

2nd International Workshop on Benchmarking RDF Systems

+
Tags:
+ WORKSHOP + , CFP + , BENCHMARK + , BERSYS + +
+
+ +

Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013), the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industry practitioners can meet to discuss topics related to the performance of RDF systems. BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in line with the TPCTC series of workshops. The focus of the workshop is to expose and initiate …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: Data Generation for the Social Network Benchmark

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 [1].

+

One of the most …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/7/index.html b/benchmarks/page/7/index.html new file mode 100644 index 00000000..849afc68 --- /dev/null +++ b/benchmarks/page/7/index.html @@ -0,0 +1,787 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Getting Started With SNB

+
Tags:
+ SNB + , INTERACTIVE + , DATAGEN + +
+
+ +

In a previous blog post titled “Is SNB like Facebook’s LinkBench?”, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.

+

DATAGEN

+

DATAGEN is the data generator used by all the workloads of SNB. Here we introduced the …

+ +
+
+ +
+ + +
+
+ +
+ +

The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. This post introduces the interactive workload.

+

The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user’s social environment and potentially access data associated with the friends of a user and their friends.

+

This …

+ +
+
+ +
+ + +
+
+
+ +

Is SNB Like Facebook’s LinkBench?

+
Tags:
+ DEVELOPER + , SNB + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ + + post/is-snb-like-facebooks-linkbench/SNB-workloads-vs-systems.jpg +
+ +
+ +

In this post, I will discuss in some detail the rationale and goals of the design of the Social Network Benchmark (SNB) and explain how it relates to real …

+ +
+
+ +
+ + +
+
+
+ +

Making It Interactive

+
Tags:
+ SNB + , BENCHMARKING + , TPC + , SPARQL + , INTERACTIVE + +
+
+ +

Synopsis: Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.

+

It is about to be showtime for LDBC. The initial installment of the …

+ +
+
+ +
+ + +
+
+
+ +

SNB Data Generator - Getting Started

+
Tags:
+ DATAGEN + , SNB + , SOCIAL NETWORK + +
+
+ +

In previous posts (this and this) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.

+

Getting and Configuring Hadoop

+

DATAGEN runs on top of Hadoop 1.2.1 to be scalable. …

+ +
+
+ +
+ + +
+
+
+ +

The Day of Graph Analytics

+
Tags:
+ ANALYTICS + , SNB + +
+
+ +

Note: consider this post as a continuation of the “Making it interactive” post by Orri Erling.

+

I have now completed the Virtuoso TPC-H work, including scale out. Optimization possibilities extend to infinity but the present level is good enough. TPC-H is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/page/8/index.html b/benchmarks/page/8/index.html new file mode 100644 index 00000000..acd8ceea --- /dev/null +++ b/benchmarks/page/8/index.html @@ -0,0 +1,658 @@ + + + + + Benchmarks + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Benchmarks

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Using LDBC SPB to Find OWLIM Performance Issues

+
Tags:
+ LDBC + , SPB + , RDF + +
+
+ +

During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC Semantic Publishing Benchmark (LDBC-SPB) as a part of our development and release process.

+

The first thing we started using the LDBC-SPB for was monitoring the performance of our RDF Store when a new release is about to come out.

+

Initially, we decided to fix some of the benchmark parameters:

+
    +
  • the dataset size - 50 million triples (LDBC-SPB50) * benchmark warmup …
+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/snb-bi/index.html b/benchmarks/snb-bi/index.html new file mode 100644 index 00000000..bf4b3c24 --- /dev/null +++ b/benchmarks/snb-bi/index.html @@ -0,0 +1,562 @@ + + + + + LDBC SNB Business Intelligence workload + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + + + + + +
+
+
+
+ + +

LDBC SNB Business Intelligence workload

+ + + + +
+
+
+
+ + + + + +
+ + + +
+
+
+
+

The LDBC SNB Business Intelligence workload focuses on aggregation- and join-heavy complex queries touching a large portion of the graph with microbatches of insert/delete operations. Its data sets are available in Cloudflare R2.

+

For an overview of the workload, see the VLDB 2023 paper and its presentation by Gabor Szarnyas.

+

The workload produces scoring metrics for performance (power and throughput scores) at the given scale and price/performance metrics. The full disclosure reports (FDR) further break down the composition of the metric into its constituent parts, e.g. single query execution times.

+

Note that the system cost is the sum of the license, hardware, and maintenance costs, where maintenance means 24/7 support with a response time of less than 4 hours.

+
+ +

Audited results

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmark setup | SF | Hardware | Performance metrics and cost | Documents
+ + 30,000 | 72 ecs.r7.16xlarge instances:
64×Intel Xeon Platinum 8369B vCPUs, 
512 GiB RAM per instance
+
    +
  • Power@SF: 111,775.39
  • +
  • Power@SF (price‑adjusted): 19.79
  • +
  • Throughput@SF: 56,920.48
  • +
  • Throughput@SF (price‑adjusted): 10.08
  • +
+
+ +
+
    +
  • System: TigerGraph 3.7.0
  • +
  • Test sponsor: TigerGraph
  • +
  • Date:
  • + + +
  • Query language: GSQL
  • +
  • System cost: 142,815 USD
  • +
+
100 | 1 AWS r6a.4xlarge instance:
16×AMD EPYC 7R13 vCPUs,
128GiB RAM
+
    +
  • Power@SF: 6,253.72
  • +
  • Power@SF (price‑adjusted): 43.79
  • +
  • Throughput@SF: 3,723.44
  • +
  • Throughput@SF (price‑adjusted): 26.07
  • +
+
+ +
+
    +
  • System: TigerGraph 3.7.0
  • +
  • Test sponsor: TigerGraph
  • +
  • Date:
  • + + +
  • Query language: GSQL
  • +
  • System cost: 1,302,174 USD
  • +
+
1,000 | 4 AWS r6a.8xlarge instances:
32×AMD EPYC 7R13 vCPUs,
256GiB RAM per instance
+
    +
  • Power@SF: 23,951.74
  • +
  • Power@SF (price‑adjusted): 18.39
  • +
  • Throughput@SF: 10,605.12
  • +
  • Throughput@SF (price‑adjusted): 8.14
  • +
+
+ +
+
    +
  • System: TigerGraph 3.7.0
  • +
  • Test sponsor: TigerGraph
  • +
  • Date:
  • + + +
  • Query language: GSQL
  • +
  • System cost: 7,871,354 USD
  • +
+
10,000 | 48 AWS r6a.8xlarge instances:
32×AMD EPYC 7R13 vCPUs,
256GiB RAM per instance
+
    +
  • Power@SF: 89,444.50
  • +
  • Power@SF (price‑adjusted): 11.36
  • +
  • Throughput@SF: 41,025.76
  • +
  • Throughput@SF (price‑adjusted): 5.21
  • +
+
+ +
+
    +
  • System: TigerGraph 3.7.0
  • +
  • Test sponsor: TigerGraph
  • +
  • Date:
  • + + +
  • Query language: GSQL
  • +
  • System cost: 1,353,315 USD
  • +
+
1,000 | Dell PowerEdge 6625 with
64×AMD EPYC 9354 CPU cores and
1.5TiB RAM
+
    +
  • Power@SF: 30,990.08
  • +
  • Power@SF (price‑adjusted): 22.90
  • +
  • Throughput@SF: 12,993.85
  • +
  • Throughput@SF (price‑adjusted): 9.60
  • +
+
+ +
+ + +

The audited LDBC SNB BI results displayed above are available as a CSV file.

+

LDBC-certified auditors

+

SNB BI audits can be commissioned from the following LDBC-certified auditors:

+
    +
  • Fabian Murariu (Pometry Ltd.)
  • +
  • David Püroja
  • +
+ +
+
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/snb-interactive/index.html b/benchmarks/snb-interactive/index.html new file mode 100644 index 00000000..dc78193d --- /dev/null +++ b/benchmarks/snb-interactive/index.html @@ -0,0 +1,706 @@ + + + + + LDBC SNB Interactive workload + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + + + + + +
+
+
+
+ + +

LDBC SNB Interactive workload

+ + + + +
+
+
+
+ + + + + +
+ + + +
+
+
+
+

The LDBC SNB Interactive workload captures a transactional graph processing scenario with complex read queries that access the neighbourhood of a given node in the graph and update operations that continuously insert new data in the graph. Its data sets are available in the CWI/SURF data repository.

+

The workload produces the throughput metric to characterize the performance at the given scale. The full disclosure reports (FDR) further detail the performance of the system under test by listing the data loading time and single query execution times.

+

Note that the system cost is the sum of the license, hardware, and maintenance costs, where maintenance means 24/7 support with a response time of less than 4 hours.

+
+ + +

Audited results using a declarative query language

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmark setup | SF | Hardware | Throughput | Documents
+
    +
  • System: GraphDB 10.1.1
  • +
  • Test sponsor: Ontotext
  • +
  • Date:
  • + + +
  • Query language: SPARQL
  • +
  • System cost: 216,222 USD
  • +
+
30AWS r6id.8xlarge, 256GiB RAM,
32×Intel Xeon Platinum 8375C vCPUs,
1 read thread, 1 write thread
3.04 ops/s + +
30AWS r6id.8xlarge, 256GiB RAM,
32×Intel Xeon Platinum 8375C vCPUs,
2 read threads, 2 write threads
6.76 ops/s
30AWS r6id.8xlarge, 256GiB RAM,
32×Intel Xeon Platinum 8375C vCPUs,
4 read threads, 4 write threads
12.16 ops/s
+ + + +

Audited results using an imperative language

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Benchmark setup | SF | Hardware | Throughput | Documents
+
    +
  • System: AtlasGraph 3.0.0
  • +
  • Test sponsor: StarGraph
  • +
  • Date:
  • + + +
  • Queries implemented in: Rust stored procedures
  • +
  • System cost: 2,032,461 RMB
  • +
+
30AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
37,631.25 ops/s + +
100AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
48,764.08 ops/s
300AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
48,311.63 ops/s
+ + 30AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8175M vCPUs
33,180.87 ops/s + +
100AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8175M vCPUs
33,625.36 ops/s
300AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8175M vCPUs
33,261.38 ops/s
+
    +
  • System: TuGraph 3.3.4
  • +
  • Test sponsor: Ant Group
  • +
  • Date:
  • + + +
  • Queries implemented in: C++ stored procedures
  • +
  • System cost: 277,542 RMB
  • +
+
30Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM,
64×Arm-based YiTian 710 vCPUs
16,133.08 ops/s + +
100Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM,
64×Arm-based YiTian 710 vCPUs
16,966.26 ops/s
300Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM,
64×Arm-based YiTian 710 vCPUs
13,532.62 ops/s
+
    +
  • System: TuGraph 3.2.0
  • +
  • Test sponsor: Ant Group
  • +
  • Date:
  • + + +
  • Queries implemented in: C++ stored procedures
  • +
  • System cost: 291,176 USD
  • +
+
30AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
12,252.50 ops/s + +
100AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
12,934.61 ops/s
300AWS r5d.12xlarge, 384GiB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
12,721.24 ops/s
+
    +
  • System: Galaxybase 3.3.0
  • +
  • Test sponsor: CreateLink
  • +
  • Date:
  • + + +
  • Queries implemented in: Java stored procedures
  • +
  • System cost: 263,282 USD
  • +
+
30AWS r5d.12xlarge, 372GB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
9,285.86 ops/s + +
100AWS r5d.12xlarge, 372GB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
8,501.21 ops/s
300AWS r5d.12xlarge, 372GB RAM,
48×Intel Xeon Platinum 8259CL vCPUs
8,370.52 ops/s
+
    +
  • System: TuGraph 1.1
  • +
  • Test sponsor: FMA
  • +
  • Date:
  • + + +
  • Queries implemented in: C++ stored procedures
  • +
  • System cost: 280,650 USD
  • +
+
30AWS r5d.12xlarge, 374GB RAM,
48×Intel Xeon Platinum 8175M vCPUs
5,436.47 ops/s + +
100AWS r5d.12xlarge, 374GB RAM,
48×Intel Xeon Platinum 8175M vCPUs
5,010.77 ops/s
300AWS r5d.12xlarge, 374GB RAM,
48×Intel Xeon Platinum 8175M vCPUs
4,855.52 ops/s
+ + +

Results as a CSV file

+

The audited LDBC SNB Interactive results displayed above are available as a CSV file.

+

LDBC-certified auditors

+

SNB Interactive audits can be commissioned from the following LDBC-certified auditors:

+
    +
  • Márton Búr
  • +
  • Arnau Prat-Pérez
  • +
  • David Püroja
  • +
+

Legacy audited results

+

Social Network Benchmark Interactive, version 0.2.2

+

Future workloads

+

The LDBC SNB Interactive v2 workload is currently under development. See the TPCTC 2023 paper “The LDBC Social Network Benchmark Interactive Workload v2: A Transactional Graph Query Benchmark with Deep Delete Operations” and its slide deck for details.

+ +
+
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-executive_summary.pdf new file mode 100644 index 00000000..92155d75 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-signatures.pdf new file mode 100644 index 00000000..a4c50672 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph.pdf b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph.pdf new file mode 100644 index 00000000..63b7b02b Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20221109_SF1000_tigergraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph-executive_summary.pdf new file mode 100644 index 00000000..a549af81 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph.pdf new file mode 100644 index 00000000..e8688813 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF10000_tigergraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph-executive_summary.pdf new file mode 100644 index 00000000..3e284938 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph-executive_summary.pdf differ diff --git 
a/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph.pdf new file mode 100644 index 00000000..8a8c06d7 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF1000_tigergraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf new file mode 100644 index 00000000..6a43472f Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph-executive_summary.pdf new file mode 100644 index 00000000..e1930c2b Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph.pdf b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph.pdf new file mode 100644 index 00000000..4e1ce728 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20230406_SF100_tigergraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-executive_summary.pdf new file mode 100644 index 00000000..49e9e3cf Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-signatures.pdf new file mode 100644 index 00000000..4b8322ac Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph.pdf b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph.pdf new file mode 100644 index 00000000..4c2c8ba5 Binary files 
/dev/null and b/benchmarks/snb/LDBC_SNB_BI_20231203_SF30000_tugraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF100_sparksee.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF100_sparksee.pdf new file mode 100644 index 00000000..29af3830 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF100_sparksee.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF100_virtuoso.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF100_virtuoso.pdf new file mode 100644 index 00000000..064b348c Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF100_virtuoso.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF10_sparksee.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF10_sparksee.pdf new file mode 100644 index 00000000..e3908056 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF10_sparksee.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF300_virtuoso.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF300_virtuoso.pdf new file mode 100644 index 00000000..8cd9fecd Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF300_virtuoso.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF30_sparksee.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF30_sparksee.pdf new file mode 100644 index 00000000..c97aa8eb Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF30_sparksee.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20150427_SF30_virtuoso.pdf b/benchmarks/snb/LDBC_SNB_I_20150427_SF30_virtuoso.pdf new file mode 100644 index 00000000..d18bd7ca Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20150427_SF30_virtuoso.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf new file mode 100644 index 00000000..20ba1174 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf differ diff --git 
a/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf new file mode 100644 index 00000000..808c2e18 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf new file mode 100644 index 00000000..ef07701f Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf new file mode 100644 index 00000000..33e6a074 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf new file mode 100644 index 00000000..60b2d6dc Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf new file mode 100644 index 00000000..380c2c56 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf new file mode 100644 index 00000000..93cda796 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf 
b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf new file mode 100644 index 00000000..6cb96bac Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf new file mode 100644 index 00000000..61ea8cfa Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf new file mode 100644 index 00000000..1843d2a8 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf new file mode 100644 index 00000000..44f34eb1 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf new file mode 100644 index 00000000..a45d8dc1 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf new file mode 100644 index 00000000..2cb9c47c Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf new file mode 100644 index 00000000..14bc45bb Binary files /dev/null and 
b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb.pdf b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb.pdf new file mode 100644 index 00000000..107daba5 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230131_SF30_graphdb.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf new file mode 100644 index 00000000..41fc60d1 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf new file mode 100644 index 00000000..7b79ec81 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf new file mode 100644 index 00000000..f485d158 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf new file mode 100644 index 00000000..2b9fc924 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf differ diff --git a/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf new file mode 100644 index 00000000..bb70d112 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf differ diff --git 
a/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf new file mode 100644 index 00000000..e94c0647 Binary files /dev/null and b/benchmarks/snb/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf differ diff --git a/benchmarks/snb/audited-results-v0.2.2/index.html b/benchmarks/snb/audited-results-v0.2.2/index.html new file mode 100644 index 00000000..581e8fd8 --- /dev/null +++ b/benchmarks/snb/audited-results-v0.2.2/index.html @@ -0,0 +1,421 @@ + + + + + LDBC Social Network Benchmark legacy audited results + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Social Network Benchmark legacy audited results

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

SNB Interactive (version 0.2.2) audited results

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SF | Throughput | Cost | Software | Hardware | Test Sponsor | Date | Full Disclosure Report
10
101.20 ops/s
30,427 EURSparksee 5.1.12×Xeon 2630v3 8-core 2.4GHz, 256GB RAMSparsity Technologies SAFull Disclosure Report
30
1,287.17 ops/s
20,212 EURVirtuoso 07.50.3213 v7fasttrack2×Xeon2630 6-core 2.4GHz, 192GB RAMOpenLink SoftwareFull Disclosure Report
30
86.50 ops/s
30,427 EURSparksee 5.1.12×Xeon 2630v3 8-core 2.4GHz, 256GB RAMSparsity Technologies SAFull Disclosure Report
100
1,200.00 ops/s
20,212 EURVirtuoso 07.50.3213 v7fasttrack2×Xeon2630 6-core 2.4GHz, 192GB RAMOpenLink SoftwareFull Disclosure Report
100
81.70 ops/s
37,927 EURSparksee 5.1.12×Xeon 2630v3 8-core 2.4GHz, 256GB RAMSparsity Technologies SAFull Disclosure Report
300
635.00 ops/s
20,212 EURVirtuoso 07.50.3213 v7fasttrack2×Xeon2630 6-core 2.4GHz, 192GB RAMOpenLink SoftwareFull Disclosure Report
+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/snb/index.html b/benchmarks/snb/index.html new file mode 100644 index 00000000..9464f612 --- /dev/null +++ b/benchmarks/snb/index.html @@ -0,0 +1,366 @@ + + + + + LDBC Social Network Benchmark (LDBC SNB) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Social Network Benchmark (LDBC SNB)

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

The Social Network Benchmark (SNB) suite defines graph workloads targeting database management systems and is maintained by the LDBC SNB Task Force.

+

The benchmark suite consists of two distinct workloads on a common dataset:

+
    +
  • The Business Intelligence workload focuses on aggregation- and join-heavy complex queries touching a large portion of the graph with microbatches of insert/delete operations. Its data sets are available in Cloudflare R2 and in the SURF/CWI repository.
  • +
  • The Interactive workload captures a transactional graph processing scenario with complex read queries that access the neighbourhood of a given node in the graph and update operations that continuously insert new data in the graph. Its data sets are available in the CWI/SURF data repository.
  • +
+

For a brief overview, see our talk given at FOSDEM 2023’s graph developer room. The Social Network Benchmark’s specification can be found on arXiv.

+

Audited results

+

– SNB Business Intelligence workload

+

– SNB Interactive workload

+

Commissioning audits

+

For auditing requests, please reach out at info@ldbcouncil.org. Audits can only be commissioned by LDBC member companies by contracting any of the LDBC-certified auditors. Note that there is a 2,000 GBP auditing fee to be paid to the LDBC by non-sponsor company members. Sponsor companies are exempt from this.

+

For a short summary of LDBC’s auditing process, including preparation steps, timelines, and pricing, see the Auditing process for the LDBC Social Network Benchmark document.

+

Use of audited results

+

Fair use policies

+

The LDBC Social Network Benchmark is subject to the LDBC Fair Use Policies.

+ + + +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/snb/ldbc-snb-work-charter.pdf b/benchmarks/snb/ldbc-snb-work-charter.pdf new file mode 100644 index 00000000..9743055c Binary files /dev/null and b/benchmarks/snb/ldbc-snb-work-charter.pdf differ diff --git a/benchmarks/snb/retrospective-report-neo4j.pdf b/benchmarks/snb/retrospective-report-neo4j.pdf new file mode 100644 index 00000000..e99d9111 Binary files /dev/null and b/benchmarks/snb/retrospective-report-neo4j.pdf differ diff --git a/benchmarks/snb/retrospective-report-oracle.pdf b/benchmarks/snb/retrospective-report-oracle.pdf new file mode 100644 index 00000000..fddf907e Binary files /dev/null and b/benchmarks/snb/retrospective-report-oracle.pdf differ diff --git a/benchmarks/snb/retrospective-report-tigergraph.pdf b/benchmarks/snb/retrospective-report-tigergraph.pdf new file mode 100644 index 00000000..8468fd07 Binary files /dev/null and b/benchmarks/snb/retrospective-report-tigergraph.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-1G-Virtuoso-09062015.pdf b/benchmarks/spb/LDBC-SPB-1G-Virtuoso-09062015.pdf new file mode 100644 index 00000000..a46849c9 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-1G-Virtuoso-09062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-1G-Virtuoso-EC2-10062015.pdf b/benchmarks/spb/LDBC-SPB-1G-Virtuoso-EC2-10062015.pdf new file mode 100644 index 00000000..def15fc9 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-1G-Virtuoso-EC2-10062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-256M-GraphDB-10062015.pdf b/benchmarks/spb/LDBC-SPB-256M-GraphDB-10062015.pdf new file mode 100644 index 00000000..1b30a9ca Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-256M-GraphDB-10062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-256M-Virtuoso-09062015.pdf b/benchmarks/spb/LDBC-SPB-256M-Virtuoso-09062015.pdf new file mode 100644 index 00000000..672c9cf7 Binary files /dev/null and 
b/benchmarks/spb/LDBC-SPB-256M-Virtuoso-09062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-256M-Virtuoso-EC2-10062015-v2.pdf b/benchmarks/spb/LDBC-SPB-256M-Virtuoso-EC2-10062015-v2.pdf new file mode 100644 index 00000000..34ea9e86 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-256M-Virtuoso-EC2-10062015-v2.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-64M-GraphDB-10062015.pdf b/benchmarks/spb/LDBC-SPB-64M-GraphDB-10062015.pdf new file mode 100644 index 00000000..b0d71274 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-64M-GraphDB-10062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-64M-Virtuoso-09062015.pdf b/benchmarks/spb/LDBC-SPB-64M-Virtuoso-09062015.pdf new file mode 100644 index 00000000..e5d86326 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-64M-Virtuoso-09062015.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129-executive-summary.pdf b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129-executive-summary.pdf new file mode 100644 index 00000000..58bf4f57 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129-executive-summary.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129.pdf b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129.pdf new file mode 100644 index 00000000..8357aa61 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-cluster-20230129.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129-executive-summary.pdf b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129-executive-summary.pdf new file mode 100644 index 00000000..74243ed5 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129-executive-summary.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129.pdf b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129.pdf new file mode 100644 index 00000000..0c4ea85d Binary files /dev/null and 
b/benchmarks/spb/LDBC-SPB-SF3-GraphDB-single-machine-20230129.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129-executive-summary.pdf b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129-executive-summary.pdf new file mode 100644 index 00000000..74cbcea1 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129-executive-summary.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129.pdf b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129.pdf new file mode 100644 index 00000000..e17d3bc4 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-cluster-20230129.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129-executive-summary.pdf b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129-executive-summary.pdf new file mode 100644 index 00000000..e1e82e68 Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129-executive-summary.pdf differ diff --git a/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129.pdf b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129.pdf new file mode 100644 index 00000000..d22954fb Binary files /dev/null and b/benchmarks/spb/LDBC-SPB-SF5-GraphDB-single-machine-20230129.pdf differ diff --git a/benchmarks/spb/LDBC_SPB20_20150426_SF1_GraphDB-EE-6.2b.pdf b/benchmarks/spb/LDBC_SPB20_20150426_SF1_GraphDB-EE-6.2b.pdf new file mode 100644 index 00000000..dc61ff00 Binary files /dev/null and b/benchmarks/spb/LDBC_SPB20_20150426_SF1_GraphDB-EE-6.2b.pdf differ diff --git a/benchmarks/spb/LDBC_SPB20_20150426_SF3_GraphDB-EE-6.2b.pdf b/benchmarks/spb/LDBC_SPB20_20150426_SF3_GraphDB-EE-6.2b.pdf new file mode 100644 index 00000000..f4865212 Binary files /dev/null and b/benchmarks/spb/LDBC_SPB20_20150426_SF3_GraphDB-EE-6.2b.pdf differ diff --git a/benchmarks/spb/index.html b/benchmarks/spb/index.html new file mode 100644 index 00000000..bbc9ec0b --- /dev/null +++ 
b/benchmarks/spb/index.html @@ -0,0 +1,594 @@ + + + + + LDBC Semantic Publishing Benchmark (LDBC SPB) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Semantic Publishing Benchmark (LDBC-SPB)

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

The Semantic Publishing Benchmark (SPB) is an LDBC benchmark, inspired by the Media/Publishing industry, for testing the performance of RDF engines. In particular, LDBC worked with the British Broadcasting Corporation (BBC) to define this benchmark, for which the BBC donated workloads, ontologies and data. The publishing industry is an area where significant adoption of RDF is taking place.

+

There have been many academic benchmarks for RDF but none of these are truly industrial-grade. The SPB combines a set of complex queries under inference with continuous updates and special failover tests for systems implementing replication.

+

SPB performance is measured by producing a workload of CRUD (Create, Read, Update, Delete) operations which are executed simultaneously. The benchmark offers a data generator that uses real reference data to produce datasets of various sizes and tests the scalability aspect of RDF systems. The benchmark workload consists of (a) editorial operations that add new data or alter or delete existing data, and (b) aggregation operations that retrieve content according to various criteria. The benchmark also tests conformance for various rules inside the OWL2-RL rule-set.

+

The SPB specification contains the description of the benchmark and the data generator and all information about its software components can be found on the SPB developer page.

+

Semantic Publishing Benchmark (SPB) Audited Results

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SFTriplesRW AgentsInteractive (Q/sec)Updates (ops/sec)CostSoftwareHardwareTest SponsorDateFDR
3
256M
16 / 4
335.48
25.66
177,474 USD
GraphDB EE 10.0.1AWS r6id.8xlargeOntotext ADFDR, summary
3
256M
24 / 0
413.16
0.00
207,474 USD
GraphDB EE 10.0.1AWS r6id.8xlargeOntotext ADFDR, summary
3
256M
64 / 4
1121.76
9.53
652,422 USD
GraphDB EE 10.0.13×AWS r6id.8xlargeOntotext ADFDR, summary
3
256M
64 / 0
985.63
0.00
562,422 USD
GraphDB EE 10.0.13×AWS r6id.8xlargeOntotext ADFDR, summary
5
1B
16 / 4
105.76
10.45
177,474 USD
GraphDB EE 10.0.1AWS r6id.8xlargeOntotext ADFDR, summary
5
1B
24 / 0
158.10
0.00
207,474 USD
GraphDB EE 10.0.1AWS r6id.8xlargeOntotext ADFDR, summary
5
1B
64 / 4
372.56
4.04
652,422 USD
GraphDB EE 10.0.13×AWS r6id.8xlargeOntotext ADFDR, summary
5
1B
64 / 0
408.68
0.00
562,422 USD
GraphDB EE 10.0.13×AWS r6id.8xlargeOntotext ADFDR, summary
1
64M
8 / 2
100.85
10.19
37,504 EUR
GraphDB EE 6.2Intel Xeon E5-1650v3 6×3.5GHz, 96GB RAMOntotext ADFDR
1
64M
8 / 2
142.76
10.67
35,323 EUR
GraphDB SE 6.3 alphaIntel Xeon E5-1650v3 6×3.5GHz, 64GB RAMOntotext ADFDR
3
256M
8 / 2
29.90
9.50
37,504 EUR
GraphDB EE 6.2Intel Xeon E5-1650v3 6×3.5GHz, 96GB RAMOntotext ADFDR
3
256M
8 / 2
54.64
9.50
35,323 EUR
GraphDB SE 6.3 alphaIntel Xeon E5-1650v3 6×3.5GHz, 64GB RAMOntotext ADFDR
1
64M
22 / 2
149.04
156.83
20,213 USD
Virtuoso v7.50.3213Intel Xeon E5-2630 6×2.30GHz, 192 GB RAMOpenLink SoftwareFDR
3
256M
22 / 2
80.62
92.71
20,213 USD
Virtuoso v7.50.3213Intel Xeon E5-2630 6×2.30GHz, 192 GB RAMOpenLink SoftwareFDR
3
256M
30 / 3
115.38
109.85
24,528 USD
Virtuoso v7.50.3213AWS r3.8xlargeOpenLink SoftwareFDR
5
1B
22 / 2
32.28
72.72
20,213 USD
Virtuoso v7.50.3213Intel Xeon E5-2630 6×2.30GHz, 192 GB RAMOpenLink SoftwareFDR
5
1B
30 / 3
45.81
55.45
24,528 USD
Virtuoso v7.50.3213AWS r3.8xlargeOpenLink SoftwareFDR
+

LDBC-certified auditors

+

SPB audits can be commissioned from the following LDBC-certified auditors:

+
    +
  • Pjotr Scholtze
  • +
+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/benchmarks/spb/ldbc-spb-v2.0-specification.pdf b/benchmarks/spb/ldbc-spb-v2.0-specification.pdf new file mode 100644 index 00000000..a91e5a91 Binary files /dev/null and b/benchmarks/spb/ldbc-spb-v2.0-specification.pdf differ diff --git a/constitutional-documents/index.html b/constitutional-documents/index.html new file mode 100644 index 00000000..712eccf0 --- /dev/null +++ b/constitutional-documents/index.html @@ -0,0 +1,357 @@ + + + + + Constitutional Documents + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Constitutional Documents

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

This page contains LDBC’s constitutional documents: its Articles of Association and Byelaws.

+

Current versions

+ +

Old versions

+ + +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/css/custom-style.css b/css/custom-style.css new file mode 100644 index 00000000..e69de29b diff --git a/data/snb_bi.csv b/data/snb_bi.csv new file mode 100644 index 00000000..28b97f0b --- /dev/null +++ b/data/snb_bi.csv @@ -0,0 +1,6 @@ +date|test_sponsor_name|test_sponsor_url|software_name|software_version|software_url|specification_version|specification_url|driver_version|driver_url|full_disclosure_report|executive_summary|signatures|supplementary_package|query_implementation|scale_factor|power_at_sf|power_at_sf_price_adjusted|throughput_at_sf|throughput_at_sf_price_adjusted|price|currency|hardware +2023-12-03|TuGraph, Ant Yunchuang Digital Technology (Beijing) Co., Ltd.|https://www.tugraph.org/|TuGraph|0.9|https://www.tugraph.org/|2.2.2|https://arxiv.org/pdf/2001.02299v7.pdf|1.0.3|https://github.com/ldbc/ldbc_snb_bi/releases/tag/v1.0.3|LDBC_SNB_BI_20231203_SF30000_tugraph.pdf|LDBC_SNB_BI_20231203_SF30000_tugraph-executive_summary.pdf|LDBC_SNB_BI_20231203_SF30000_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_BI_20231203_SF30000_tugraph-attachments.tar.gz|Gremlin|30000|111775.39|19.79|56920.48|10.08|5649500|USD|72 ecs.r7.16xlarge instances: 64×Intel Xeon Platinum 8369B vCPUs, 512 GiB RAM per instance +2023-04-06|TigerGraph|https://tigergraph.com/|TigerGraph|3.7.0|https://docs.tigergraph.com/tigergraph-server/3.7|2.2.0|https://arxiv.org/pdf/2001.02299v7.pdf|1.0.3|https://github.com/ldbc/ldbc_snb_bi/releases/tag/v1.0.3|LDBC_SNB_BI_20230406_SF100_tigergraph.pdf|LDBC_SNB_BI_20230406_SF100_tigergraph-executive_summary.pdf|LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-attachments.tar.gz|GSQL|100|6253.72|43.79|3723.44|26.07|142815|USD|1 AWS r6a.4xlarge instance: 16×AMD 
EPYC 7R13 vCPUs, 128GiB RAM +2023-04-06|TigerGraph|https://tigergraph.com/|TigerGraph|3.7.0|https://docs.tigergraph.com/tigergraph-server/3.7|2.2.0|https://arxiv.org/pdf/2001.02299v7.pdf|1.0.3|https://github.com/ldbc/ldbc_snb_bi/releases/tag/v1.0.3|LDBC_SNB_BI_20230406_SF1000_tigergraph.pdf|LDBC_SNB_BI_20230406_SF1000_tigergraph-executive_summary.pdf|LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-attachments.tar.gz|GSQL|1000|23951.74|18.39|10605.12|8.14|1302174|USD|4 AWS r6a.8xlarge instances: 32×AMD EPYC 7R13 vCPUs, 256GiB RAM per instance +2023-04-06|TigerGraph|https://tigergraph.com/|TigerGraph|3.7.0|https://docs.tigergraph.com/tigergraph-server/3.7|2.2.0|https://arxiv.org/pdf/2001.02299v7.pdf|1.0.3|https://github.com/ldbc/ldbc_snb_bi/releases/tag/v1.0.3|LDBC_SNB_BI_20230406_SF10000_tigergraph.pdf|LDBC_SNB_BI_20230406_SF10000_tigergraph-executive_summary.pdf|LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_BI_20230406_SF100_SF1000_SF10000_tigergraph-attachments.tar.gz|GSQL|10000|89444.50|11.36|41025.76|5.21|7871354|USD|48 AWS r6a.8xlarge instances: 32×AMD EPYC 7R13 vCPUs, 256GiB RAM per instance +2022-11-09|TigerGraph|https://tigergraph.com/|TigerGraph|3.7.0|https://docs.tigergraph.com/tigergraph-server/3.7|2.2.0|https://arxiv.org/pdf/2001.02299v7.pdf|1.0.2|https://github.com/ldbc/ldbc_snb_bi/releases/tag/v1.0.2|LDBC_SNB_BI_20221109_SF1000_tigergraph.pdf|LDBC_SNB_BI_20221109_SF1000_tigergraph-executive_summary.pdf|LDBC_SNB_BI_20221109_SF1000_tigergraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_BI_20221109_SF1000_tigergraph-attachments.tar.gz|GSQL|1000|30990.08|22.90|12993.85|9.60|1353315|USD|Dell PowerEdge 6625 with 64×AMD EPYC 9354 CPU cores and 1.5TiB RAM diff --git a/data/snb_interactive.csv 
b/data/snb_interactive.csv new file mode 100644 index 00000000..3876fa20 --- /dev/null +++ b/data/snb_interactive.csv @@ -0,0 +1,22 @@ +date|category|test_sponsor_name|test_sponsor_url|software_name|software_version|software_url|specification_version|specification_url|driver_version|driver_url|full_disclosure_report|executive_summary|signatures|supplementary_package|throughput_at_largest_sf|query_implementation|scale_factor|throughput|price|currency|hardware +2023-12-25|imperative|StarGraph|https://www.stargraph.cn/|AtlasGraph|3.0.0|https://atlasgraph.io/|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-attachments.tar.gz|48311.63|Rust stored procedures|30|37631.25|2032461|RMB|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2023-12-25|imperative|StarGraph|https://www.stargraph.cn/|AtlasGraph|3.0.0|https://atlasgraph.io/|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-attachments.tar.gz|48311.63|Rust stored procedures|100|48764.08|2032461|RMB|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs 
+2023-12-25|imperative|StarGraph|https://www.stargraph.cn/|AtlasGraph|3.0.0|https://atlasgraph.io/|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-executive_summary.pdf|LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20231225_SF30-100-300_AtlasGraph-attachments.tar.gz|48311.63|Rust stored procedures|300|48311.63|2032461|RMB|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2023-07-13|imperative|Alibaba DAMO Academy|https://damo.alibaba.com/|GraphScope Flex|0.23.0|https://github.com/alibaba/GraphScope|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230713_SF30-100-300_GraphscopeFlex-attachments.tar.gz|33261.38|C++ stored procedures|30|33180.87|99236.04|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8175M vCPUs +2023-07-13|imperative|Alibaba DAMO Academy|https://damo.alibaba.com/|GraphScope Flex|0.23.0|https://github.com/alibaba/GraphScope|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230713_SF30-100-300_GraphscopeFlex-attachments.tar.gz|33261.38|C++ stored procedures|100|33625.36|99236.04|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel 
Xeon Platinum 8175M vCPUs +2023-07-13|imperative|Alibaba DAMO Academy|https://damo.alibaba.com/|GraphScope Flex|0.23.0|https://github.com/alibaba/GraphScope|0.3.6|https://arxiv.org/pdf/2001.02299v1.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-executive_summary.pdf|LDBC_SNB_I_20230713_SF30-100-300_GraphScopeFlex-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230713_SF30-100-300_GraphscopeFlex-attachments.tar.gz|33261.38|C++ stored procedures|300|33261.38|99236.04|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8175M vCPUs +2023-01-31|declarative|Ontotext|https://www.ontotext.com/|GraphDB|10.1.1|https://graphdb.ontotext.com/|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230131_SF30_graphdb.pdf|LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf|LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230131_SF30_graphdb-attachments.tar.gz|12.16|SPARQL|30|3.04|216222|USD|AWS r6id.8xlarge, 256GiB RAM, 32×Intel Xeon Platinum 8375C vCPUs, 1 read thread, 1 write thread +2023-01-31|declarative|Ontotext|https://www.ontotext.com/|GraphDB|10.1.1|https://graphdb.ontotext.com/|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230131_SF30_graphdb.pdf|LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf|LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230131_SF30_graphdb-attachments.tar.gz|12.16|SPARQL|30|6.76|216222|USD|AWS r6id.8xlarge, 256GiB RAM, 32×Intel Xeon Platinum 8375C vCPUs, 2 read threads, 2 write threads 
+2023-01-31|declarative|Ontotext|https://www.ontotext.com/|GraphDB|10.1.1|https://graphdb.ontotext.com/|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230131_SF30_graphdb.pdf|LDBC_SNB_I_20230131_SF30_graphdb-executive_summary.pdf|LDBC_SNB_I_20230131_SF30_graphdb-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230131_SF30_graphdb-attachments.tar.gz|12.16|SPARQL|30|12.16|216222|USD|AWS r6id.8xlarge, 256GiB RAM, 32×Intel Xeon Platinum 8375C vCPUs, 4 read threads, 4 write threads +2022-08-16|imperative|Ant Group|https://www.antgroup.com/en|TuGraph|3.2.0|https://tech.antfin.com/products/TuGraph|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20200726_SF30-100-300_tugraph-attachments.tar.gz|12721.24|C++ stored procedures|30|12252.50|291176|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2022-08-16|imperative|Ant Group|https://www.antgroup.com/en|TuGraph|3.2.0|https://tech.antfin.com/products/TuGraph|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20200726_SF30-100-300_tugraph-attachments.tar.gz|12721.24|C++ stored procedures|100|12934.61|291176|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2022-08-16|imperative|Ant 
Group|https://www.antgroup.com/en|TuGraph|3.2.0|https://tech.antfin.com/products/TuGraph|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20220816_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20220816_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20200726_SF30-100-300_tugraph-attachments.tar.gz|12721.24|C++ stored procedures|300|12721.24|291176|USD|AWS r5d.12xlarge, 384GiB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2022-05-16|imperative|CreateLink|https://galaxybase.com/|Galaxybase|3.3.0|https://galaxybase.com/|0.3.3|https://arxiv.org/pdf/2001.02299v2.pdf|0.3.4|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.4|LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-attachments.tar.gz|8370.52|Java stored procedures|30|9285.86|263282|USD|AWS r5d.12xlarge, 372GB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2022-05-16|imperative|CreateLink|https://galaxybase.com/|Galaxybase|3.3.0|https://galaxybase.com/|0.3.3|https://arxiv.org/pdf/2001.02299v2.pdf|0.3.4|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.4|LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-attachments.tar.gz|8370.52|Java stored procedures|100|8501.21|263282|USD|AWS r5d.12xlarge, 372GB RAM, 48×Intel Xeon Platinum 8259CL vCPUs 
+2022-05-16|imperative|CreateLink|https://galaxybase.com/|Galaxybase|3.3.0|https://galaxybase.com/|0.3.3|https://arxiv.org/pdf/2001.02299v2.pdf|0.3.4|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.4|LDBC_SNB_I_20220516_SF30-100-300_galaxybase.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-executive_summary.pdf|LDBC_SNB_I_20220516_SF30-100-300_galaxybase-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220516_SF30-100-300_galaxybase-attachments.tar.gz|8370.52|Java stored procedures|300|8370.52|263282|USD|AWS r5d.12xlarge, 372GB RAM, 48×Intel Xeon Platinum 8259CL vCPUs +2020-07-26|imperative|FMA|https://fma-ai.cn/|TuGraph|1.1|https://fma-ai.cn/|0.3.2|https://arxiv.org/pdf/2001.02299v1.pdf|0.3.3|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.3|LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220816_SF30-100-300_tugraph-attachments.tar.gz|4855.52|C++ stored procedures|30|5436.47|280650|USD|AWS r5d.12xlarge, 374GB RAM, 48×Intel Xeon Platinum 8175M vCPUs +2020-07-26|imperative|FMA|https://fma-ai.cn/|TuGraph|1.1|https://fma-ai.cn/|0.3.2|https://arxiv.org/pdf/2001.02299v1.pdf|0.3.3|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.3|LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220816_SF30-100-300_tugraph-attachments.tar.gz|4855.52|C++ stored procedures|100|5010.77|280650|USD|AWS r5d.12xlarge, 374GB RAM, 48×Intel Xeon Platinum 8175M vCPUs 
+2020-07-26|imperative|FMA|https://fma-ai.cn/|TuGraph|1.1|https://fma-ai.cn/|0.3.2|https://arxiv.org/pdf/2001.02299v1.pdf|0.3.3|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/0.3.3|LDBC_SNB_I_20200726_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20200726_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20220816_SF30-100-300_tugraph-attachments.tar.gz|4855.52|C++ stored procedures|300|4855.52|280650|USD|AWS r5d.12xlarge, 374GB RAM, 48×Intel Xeon Platinum 8175M vCPUs +2023-01-28|imperative|Ant Group|https://www.antgroup.com/en|TuGraph|3.3.4|https://github.com/tugraph-db/tugraph-db|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230128_SF30-100-300_tugraph-attachments.tar.gz|13532.62|C++ stored procedures|30|16133.08|277542|RMB|Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM, 64×Arm-based YiTian 710 vCPUs +2023-01-28|imperative|Ant Group|https://www.antgroup.com/en|TuGraph|3.3.4|https://github.com/tugraph-db/tugraph-db|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230128_SF30-100-300_tugraph-attachments.tar.gz|13532.62|C++ stored procedures|100|16966.26|277542|RMB|Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM, 64×Arm-based YiTian 710 vCPUs +2023-01-28|imperative|Ant 
Group|https://www.antgroup.com/en|TuGraph|3.3.4|https://github.com/tugraph-db/tugraph-db|0.3.6|https://arxiv.org/pdf/2001.02299v3.pdf|1.2.0|https://github.com/ldbc/ldbc_snb_interactive_v1_driver/releases/tag/v1.2.0|LDBC_SNB_I_20230128_SF30-100-300_tugraph.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-executive_summary.pdf|LDBC_SNB_I_20230128_SF30-100-300_tugraph-signatures.pdf|https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/audits/LDBC_SNB_I_20230128_SF30-100-300_tugraph-attachments.tar.gz|13532.62|C++ stored procedures|300|13532.62|277542|RMB|Alibaba Cloud ecs.g8y.16xlarge, 256GiB RAM, 64×Arm-based YiTian 710 vCPUs diff --git a/developer-community/index.html b/developer-community/index.html new file mode 100644 index 00000000..76ac1014 --- /dev/null +++ b/developer-community/index.html @@ -0,0 +1,349 @@ + + + + + Developer Community + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Developer Community

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

If you are interested in joining our benchmark task forces, please reach out at info@ldbcouncil.org.

+

Licensing

+

Our benchmarks are licensed under the Apache Software License, Version 2.0 (license file, notice file).

+

Contributor License Agreement

+

To contribute to the LDBC repositories, we ask you to sign a CLA or become an LDBC member. These options are available for both individuals and organizations.

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/developer/finbench/index.html b/developer/finbench/index.html new file mode 100644 index 00000000..588fa8b8 --- /dev/null +++ b/developer/finbench/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/finbench/ + + + + + + diff --git a/developer/snb/index.html b/developer/snb/index.html new file mode 100644 index 00000000..722eb105 --- /dev/null +++ b/developer/snb/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/snb/ + + + + + + diff --git a/developer/spb/index.html b/developer/spb/index.html new file mode 100644 index 00000000..b1746b28 --- /dev/null +++ b/developer/spb/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/benchmarks/spb/ + + + + + + diff --git a/docs/LDBC.Articles.of.Association.ADOPTED.2021-01-14.pdf b/docs/LDBC.Articles.of.Association.ADOPTED.2021-01-14.pdf new file mode 100644 index 00000000..cfbc0f83 Binary files /dev/null and b/docs/LDBC.Articles.of.Association.ADOPTED.2021-01-14.pdf differ diff --git a/docs/LDBC.Articles.of.Association.ADOPTED.2023-03-30.pdf b/docs/LDBC.Articles.of.Association.ADOPTED.2023-03-30.pdf new file mode 100644 index 00000000..284c51d2 Binary files /dev/null and b/docs/LDBC.Articles.of.Association.ADOPTED.2023-03-30.pdf differ diff --git a/docs/LDBC.Byelaws.1.1.ADOPTED.2017-03-02.pdf b/docs/LDBC.Byelaws.1.1.ADOPTED.2017-03-02.pdf new file mode 100644 index 00000000..da96ef26 Binary files /dev/null and b/docs/LDBC.Byelaws.1.1.ADOPTED.2017-03-02.pdf differ diff --git a/docs/LDBC.Byelaws.1.2.ADOPTED.2020-07-28.pdf b/docs/LDBC.Byelaws.1.2.ADOPTED.2020-07-28.pdf new file mode 100644 index 00000000..2416cbb1 Binary files /dev/null and b/docs/LDBC.Byelaws.1.2.ADOPTED.2020-07-28.pdf differ diff --git a/docs/LDBC.Byelaws.1.3.ADOPTED.2021-01-14.pdf b/docs/LDBC.Byelaws.1.3.ADOPTED.2021-01-14.pdf new file mode 100644 index 00000000..9800be11 Binary files 
/dev/null and b/docs/LDBC.Byelaws.1.3.ADOPTED.2021-01-14.pdf differ diff --git a/docs/LDBC.Byelaws.1.4.ADOPTED.2023-05-02.pdf b/docs/LDBC.Byelaws.1.4.ADOPTED.2023-05-02.pdf new file mode 100644 index 00000000..b4e7f658 Binary files /dev/null and b/docs/LDBC.Byelaws.1.4.ADOPTED.2023-05-02.pdf differ diff --git a/docs/LDBC.Individual.Contributor.License.Agreement.Form-2020-10-23.pdf b/docs/LDBC.Individual.Contributor.License.Agreement.Form-2020-10-23.pdf new file mode 100644 index 00000000..7f84e331 Binary files /dev/null and b/docs/LDBC.Individual.Contributor.License.Agreement.Form-2020-10-23.pdf differ diff --git a/docs/LDBC.Membership--Joining.and.Renewing.(August.2023).pdf b/docs/LDBC.Membership--Joining.and.Renewing.(August.2023).pdf new file mode 100644 index 00000000..b826fd39 Binary files /dev/null and b/docs/LDBC.Membership--Joining.and.Renewing.(August.2023).pdf differ diff --git a/docs/LDBC.Membership.-.Joining.and.Renewing.2022.pdf b/docs/LDBC.Membership.-.Joining.and.Renewing.2022.pdf new file mode 100644 index 00000000..7c7b9c28 Binary files /dev/null and b/docs/LDBC.Membership.-.Joining.and.Renewing.2022.pdf differ diff --git a/docs/LDBC.Membership.Application.and.Renewal.Form.2021-01-14.pdf b/docs/LDBC.Membership.Application.and.Renewal.Form.2021-01-14.pdf new file mode 100644 index 00000000..aea7ecf4 Binary files /dev/null and b/docs/LDBC.Membership.Application.and.Renewal.Form.2021-01-14.pdf differ diff --git a/docs/LDBC.Membership.Application_Renewal.Form.and.incorporated.Membership.Agreement.(August.2023).pdf b/docs/LDBC.Membership.Application_Renewal.Form.and.incorporated.Membership.Agreement.(August.2023).pdf new file mode 100644 index 00000000..d5dd797c Binary files /dev/null and b/docs/LDBC.Membership.Application_Renewal.Form.and.incorporated.Membership.Agreement.(August.2023).pdf differ diff --git a/docs/LDBC.Organization.Contributor.License.Agreement.Form-2020-10-23.pdf 
b/docs/LDBC.Organization.Contributor.License.Agreement.Form-2020-10-23.pdf new file mode 100644 index 00000000..253b6776 Binary files /dev/null and b/docs/LDBC.Organization.Contributor.License.Agreement.Form-2020-10-23.pdf differ diff --git a/docs/ldbc-snb-auditing-process.pdf b/docs/ldbc-snb-auditing-process.pdf new file mode 100644 index 00000000..38464384 Binary files /dev/null and b/docs/ldbc-snb-auditing-process.pdf differ diff --git a/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-01--SQL_PGQ-data-model-and-graph-schema.DOI.10.54285_ldbc.QZSK3559.pdf b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-01--SQL_PGQ-data-model-and-graph-schema.DOI.10.54285_ldbc.QZSK3559.pdf new file mode 100644 index 00000000..324fec17 Binary files /dev/null and b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-01--SQL_PGQ-data-model-and-graph-schema.DOI.10.54285_ldbc.QZSK3559.pdf differ diff --git a/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-02--Introduction-to-GQL-Schema-design.DOI.10.54285_ldbc.EPWQ6741.pdf b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-02--Introduction-to-GQL-Schema-design.DOI.10.54285_ldbc.EPWQ6741.pdf new file mode 100644 index 00000000..899cbad5 Binary files /dev/null and b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-02--Introduction-to-GQL-Schema-design.DOI.10.54285_ldbc.EPWQ6741.pdf differ diff --git a/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-03--Cypher-schema-constraints-proposal.DOI.10.54285_ldbc.KKHM1756.pdf b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-03--Cypher-schema-constraints-proposal.DOI.10.54285_ldbc.KKHM1756.pdf new file mode 100644 index 00000000..e5a385e4 Binary files /dev/null and b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-03--Cypher-schema-constraints-proposal.DOI.10.54285_ldbc.KKHM1756.pdf differ diff --git a/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-04--LDBC-Property-Graph-Schema-contributions-to-WG3.DOI.10.54285_ldbc.OFJF3566.pdf 
b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-04--LDBC-Property-Graph-Schema-contributions-to-WG3.DOI.10.54285_ldbc.OFJF3566.pdf new file mode 100644 index 00000000..14cb184a Binary files /dev/null and b/docs/papers/LDBC-Open-Access-External-Paper-OAEP-2023-04--LDBC-Property-Graph-Schema-contributions-to-WG3.DOI.10.54285_ldbc.OFJF3566.pdf differ diff --git a/docs/papers/LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.DOI.10.54285_ldbc.TZJP7279.pdf b/docs/papers/LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.DOI.10.54285_ldbc.TZJP7279.pdf new file mode 100644 index 00000000..0b499494 Binary files /dev/null and b/docs/papers/LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.DOI.10.54285_ldbc.TZJP7279.pdf differ diff --git a/docs/papers/LDBC-Work-Charter-WC-2022-02--LDBC-Extended-Graph-Schema--LEX--Work-Charter.DOI.10.54285_ldbc.VSBC2149.pdf b/docs/papers/LDBC-Work-Charter-WC-2022-02--LDBC-Extended-Graph-Schema--LEX--Work-Charter.DOI.10.54285_ldbc.VSBC2149.pdf new file mode 100644 index 00000000..eeb23d0d Binary files /dev/null and b/docs/papers/LDBC-Work-Charter-WC-2022-02--LDBC-Extended-Graph-Schema--LEX--Work-Charter.DOI.10.54285_ldbc.VSBC2149.pdf differ diff --git a/docs/papers/datagen-deletes-grades-nda-2020.pdf b/docs/papers/datagen-deletes-grades-nda-2020.pdf new file mode 100644 index 00000000..1900a604 Binary files /dev/null and b/docs/papers/datagen-deletes-grades-nda-2020.pdf differ diff --git a/docs/papers/ldbc-acid-tpctc2020.pdf b/docs/papers/ldbc-acid-tpctc2020.pdf new file mode 100644 index 00000000..19044a2b Binary files /dev/null and b/docs/papers/ldbc-acid-tpctc2020.pdf differ diff --git a/docs/papers/ldbc-organization-tpctc2023-preprint.pdf b/docs/papers/ldbc-organization-tpctc2023-preprint.pdf new file mode 100644 index 00000000..f4e61661 Binary files /dev/null and 
b/docs/papers/ldbc-organization-tpctc2023-preprint.pdf differ diff --git a/docs/papers/ldbc-snb-bi-grades-nda-2018.pdf b/docs/papers/ldbc-snb-bi-grades-nda-2018.pdf new file mode 100644 index 00000000..272aad49 Binary files /dev/null and b/docs/papers/ldbc-snb-bi-grades-nda-2018.pdf differ diff --git a/docs/papers/ldbc-snb-bi-vldb-2022.pdf b/docs/papers/ldbc-snb-bi-vldb-2022.pdf new file mode 100644 index 00000000..cd354e6c Binary files /dev/null and b/docs/papers/ldbc-snb-bi-vldb-2022.pdf differ diff --git a/docs/papers/ldbc-snb-bi-vldb-2023.pdf b/docs/papers/ldbc-snb-bi-vldb-2023.pdf new file mode 100644 index 00000000..cd354e6c Binary files /dev/null and b/docs/papers/ldbc-snb-bi-vldb-2023.pdf differ diff --git a/docs/papers/ldbc-snb-interactive-sigmod-2015.pdf b/docs/papers/ldbc-snb-interactive-sigmod-2015.pdf new file mode 100644 index 00000000..bf76ed84 Binary files /dev/null and b/docs/papers/ldbc-snb-interactive-sigmod-2015.pdf differ diff --git a/docs/papers/ldbc-snb-interactive-v2-tpctc2023-preprint.pdf b/docs/papers/ldbc-snb-interactive-v2-tpctc2023-preprint.pdf new file mode 100644 index 00000000..533f885d Binary files /dev/null and b/docs/papers/ldbc-snb-interactive-v2-tpctc2023-preprint.pdf differ diff --git a/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf b/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf new file mode 100644 index 00000000..ead0d1e7 Binary files /dev/null and b/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf differ diff --git a/docs/papers/parameter-curation-tpctc2014.pdf b/docs/papers/parameter-curation-tpctc2014.pdf new file mode 100644 index 00000000..6d58915f Binary files /dev/null and b/docs/papers/parameter-curation-tpctc2014.pdf differ diff --git a/docs/papers/s3g2-scalable-data-generator-tpctc2012.pdf b/docs/papers/s3g2-scalable-data-generator-tpctc2012.pdf new file mode 100644 index 00000000..e9b1578a Binary files /dev/null and 
b/docs/papers/s3g2-scalable-data-generator-tpctc2012.pdf differ diff --git a/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf b/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf new file mode 100644 index 00000000..69eb527e Binary files /dev/null and b/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf differ diff --git a/docs/papers/w16045-prepub-LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.pdf b/docs/papers/w16045-prepub-LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.pdf new file mode 100644 index 00000000..77854124 Binary files /dev/null and b/docs/papers/w16045-prepub-LDBC-Technical-Report-TR-2021-01--Property-graphs-and-paths-in-GQL-Mathematical-definitions.pdf differ diff --git a/docs/presentations/graph-databases-2021-11.pdf b/docs/presentations/graph-databases-2021-11.pdf new file mode 100644 index 00000000..eac7b460 Binary files /dev/null and b/docs/presentations/graph-databases-2021-11.pdf differ diff --git a/docs/presentations/ldbc-snb-2021-12.pdf b/docs/presentations/ldbc-snb-2021-12.pdf new file mode 100644 index 00000000..9cbc43e5 Binary files /dev/null and b/docs/presentations/ldbc-snb-2021-12.pdf differ diff --git a/docs/presentations/ldbc-snb-2022-11.pdf b/docs/presentations/ldbc-snb-2022-11.pdf new file mode 100644 index 00000000..9aec9e54 Binary files /dev/null and b/docs/presentations/ldbc-snb-2022-11.pdf differ diff --git a/docs/presentations/tpctc-2023-ldbc-linked-data-benchmark-council-organization.pdf b/docs/presentations/tpctc-2023-ldbc-linked-data-benchmark-council-organization.pdf new file mode 100644 index 00000000..82daeb91 Binary files /dev/null and b/docs/presentations/tpctc-2023-ldbc-linked-data-benchmark-council-organization.pdf differ diff --git a/docs/presentations/tpctc-2023-ldbc-snb-interactive-v2.pdf b/docs/presentations/tpctc-2023-ldbc-snb-interactive-v2.pdf new file mode 100644 index 00000000..7a1d7d5e Binary files 
/dev/null and b/docs/presentations/tpctc-2023-ldbc-snb-interactive-v2.pdf differ diff --git a/docs/presentations/vldb-2023-ldbc-snb-bi-slides-szarnyasg.pdf b/docs/presentations/vldb-2023-ldbc-snb-bi-slides-szarnyasg.pdf new file mode 100644 index 00000000..90f1df3b Binary files /dev/null and b/docs/presentations/vldb-2023-ldbc-snb-bi-slides-szarnyasg.pdf differ diff --git a/event/eighth-tuc-meeting/index.html b/event/eighth-tuc-meeting/index.html new file mode 100644 index 00000000..d8936196 --- /dev/null +++ b/event/eighth-tuc-meeting/index.html @@ -0,0 +1,575 @@ + + + + + Eighth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Eighth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Damaris Coll / on 22 Jun 2016
+ + + +
+ Location: Oracle Conference Center in Redwood Shores (CA) US +
+
+ + Event dates: 22 Jun 2016 14:45 -- 23 Jun 2016 14:45 (local timezone) + +
+
+

The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.

+

This will be a two-day event at the Oracle Conference Center in Redwood Shores facility on Wednesday and Thursday June 22-23, 2016.

+

This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking experiences.
  • +
  • Presentation of the benchmarking results for the different benchmarks.
  • +
  • Interaction with the new LDBC Board of Directors and the LDBC organisation officials.
  • +
+

We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu; in order to notify Oracle security in advance, registration requests need to be in by June 12.

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.

+

Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.

+

In this page, you’ll find information about the following items:

+ +

Agenda

+

On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1pm.

+
Wednesday, 22nd of June 2016 (Room 203)
+

(full morning: LDBC Board of Directors meeting)

+ +
Thursday, 23rd of June 2016 (Room 203)
+ +
Friday, 24th of June 2016 (Room 105)
+

At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (GRADES16).

+

18:30 social dinner for GRADES registrants (place to be announced)

+

Logistics

+
Date
+

22nd and 23rd June 2016

+
Venue
+

The TUC meeting will be held in the Oracle Conference Center

+

The address is:

+

Room 203 (Wed-Thu) & Room 105 (Fri)
+Oracle Conference Center
+350 Oracle Parkway
+Redwood City, CA 94065, USA

+

Maps and situation

+

Google Maps link

+

Oracle Campus map:

+

+
Getting there
+
Driving directions
+
    +
  • [Southbound] - Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.
  • +
  • [Northbound] - Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.
  • +
+
Parking
+

The Conference Center has a designated parking lot located directly across from the building. If the lot is filled there is also additional parking in any of the parking garages located nearby. No parking permits are needed.

+
Public transport
+

Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.

+ +

You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.

+

Alternatively, SamTrans (San Mateo County’s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/eleventh-tuc-meeting/index.html b/event/eleventh-tuc-meeting/index.html new file mode 100644 index 00000000..c1a3b0cd --- /dev/null +++ b/event/eleventh-tuc-meeting/index.html @@ -0,0 +1,527 @@ + + + + + Eleventh TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Eleventh TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Peter Boncz / on 08 Jun 2018
+ + + +
+ Location: Austin, TX +
+
+ + Event date: 08 Jun 2018 08:30 (local timezone) + +
+
+

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.

+

This TUC meeting will be a one-day event preceding the SIGMOD/PODS 2018 conference in Houston, Texas (not too far away, the whole next week). Note also that at SIGMOD/PODS in Houston on Sunday 10, there is a research workshop on graph data management technology called GRADES-NDA 2018 as well, so you might combine travel.

+

We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu to register.

+

=> registration is free, but required <=

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (boncz@cwi.nl) and Larri (larri@ac.upc.edu). Local organizer is Juan Sequeda (juanfederico@gmail.com).

+

Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.

+

Agenda

+

In the TUC meeting there will be:

+
    +
  • updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.
  • +
  • talks by data management practitioners highlighting graph data management challenges and products
  • +
+

The meeting will start on Friday morning, with a program from 10:30-17:00:

+ +

Location

+

The TUC will be held at the University of Texas at Austin, Department of Computer Science in the Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712 Room: GDC 6.302

+

The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take the elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.

+

From Austin to SIGMOD/PODS (Houston) on Saturday June 9

+

Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.

+

Bus

+

One option is to take a MegaBus that departs from downtown Austin and arrives at downtown Houston.

+

There is a bus that departs at 12:00PM and arrives at 3:00pm. Cost is $20 (as of April 23).

+

If you want to spend the day in Austin, there is a bus that departs at 9:55PM and arrives at 12:50am. Cost is $5 (as of April 23).

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/fifteenth-tuc-meeting/index.html b/event/fifteenth-tuc-meeting/index.html new file mode 100644 index 00000000..d01d740d --- /dev/null +++ b/event/fifteenth-tuc-meeting/index.html @@ -0,0 +1,660 @@ + + + + + Fifteenth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Fifteenth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Gábor Szárnyas / on 05 Apr 2022
+ + + +
+ Location: Philadelphia, Marriott Philadelphia Downtown +
+
+ + Event dates: 17 Jun 2022 09:20 -- 18 Jun 2022 15:30 (local timezone) + +
+
+

Organizers: Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2022 on June 17-18 (Friday-Saturday).

+

The program consists of 10-15 minute talks followed by a Q&A session. The talks will be recorded and made available online.
+The tentative program is the following. All times are in EDT.

+

We will have a social event on Friday at 17:30 at El Vez (Google Maps).

+

Friday (Pennsylvania Convention Center, room 204B)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startfinishspeakertitle
09:2009:30Peter Boncz (LDBC/CWI)State of the union – slides, video
09:3009:45Alastair Green (LDBC/Birkbeck)LDBC’s fair use policies – slides, video
09:5010:05Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)LDBC Social Network Benchmark: Business Intelligence workload v1.0 – slides, video
10:1010:25Heng Lin (Ant Group)LDBC Financial Benchmark introduction – slides, video
10:3011:00coffee break
11:0011:15Chen Zhang (CreateLink)New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – slides, video
11:2011:35James Clarkson (Neo4j)LDBC benchmarks: Promoting good science and industrial consumption – slides, video
11:4011:55Oskar van Rest (Oracle)Creating and querying property graphs in Oracle, on-premise and in the cloud – slides, video
12:0012:15Mingxi Wu (TigerGraph)Conquering LDBC SNB BI at SF-10k – slides, video
12:2013:20lunch (on your own)
13:2013:35Altan Birler (Technische Universität München)Relational databases can handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – slides, video
13:4013:55David Püroja (CWI)LDBC Social Network Benchmark: Interactive workload v2.0 – slides
14:0014:15Angela Bonifati (Lyon 1 University)The quest for schemas in graph databases – slides, video
14:2014:35Matteo Lissandrini (Aalborg University)Understanding graph data representations in triplestores – slides, video
14:4014:55Wim Martens (University of Bayreuth)Path representations – slides, video
15:0015:20Audrey Cheng (UC Berkeley)TAOBench: An end-to-end benchmark for social network workloads – slides, video
+

Saturday (Philadelphia Marriott Downtown, room 401-402, 4th floor)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startfinishspeakertitle
10:0010:15Keith Hare (WG3)An update on the GQL & SQL/PGQ standards efforts – slides, video
10:2010:35Leonid Libkin (ENS Paris)Pattern matching in GQL and SQL/PGQ – slides, video
10:4010:55Petra Selmer (Neo4j/WG3)An overview of GQL – slides, video
11:0011:15Alastair Green (LDBC/WG3)GQL 2.0: A technical manifesto – slides, video
11:2011:35George Fletcher (TU Eindhoven)PG-Keys (LDBC Property Graph Schema Working Group) – slides, video
11:4011:55Arvind Shyamsundar (Microsoft)Graph capabilities in Microsoft SQL Server and Azure SQL Database – slides, video
12:0013:30lunch (on your own)
13:3013:45Daniël ten Wolde (CWI)Implementing SQL/PGQ in DuckDB – slides, video
13:5014:05Oszkár Semeráth, Kristóf Marussy (TU Budapest)Generation techniques for consistent, realistic, diverse, and scalable graphs – slides, video
14:1014:25Molham Aref (RelationalAI)Graph Normal Form – slides, video
14:3014:45Naomi Arnold (Queen Mary University of London)Temporal graph analysis of the far-right social network Gab – slides, video
14:5015:05Domagoj Vrgoč (PUC Chile)Evaluating path queries in MillenniumDB – slides, video
15:1015:25Pavel Klinov, Evren Sirin (Stardog)Stardog’s experience with LDBC – slides, video
+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/fifth-tuc-meeting/index.html b/event/fifth-tuc-meeting/index.html new file mode 100644 index 00000000..e72d45c1 --- /dev/null +++ b/event/fifth-tuc-meeting/index.html @@ -0,0 +1,491 @@ + + + + + Fifth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Fifth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Irini Fundulaki / on 14 Nov 2014
+ + + +
+ Location: Athens, Greece +
+
+ + Event date: 14 Nov 2014 12:32 (local timezone) + +
+
+

The LDBC consortium is pleased to announce its fifth Technical User
+Community (TUC) meeting.

+

This will be a one-day event at the National Hellenic Research Institute
+in Athens, Greece on Friday November 14, 2014.

+

Agenda

+

10:30 - 11:00 Coffee Break

+

11:00 - 11:10 Peter Boncz (VUA) Welcome & LDBC project status update (Presentation)

+

11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark: Short Presentation of SPB and Status

+

Feedback & Roadmap for SPB & OWLIM (Presentation)

+

11:25 - 11:30 Orri Erling (OGL) Status, Feedback & Roadmap for SPB & Virtuoso (Presentation)

+

11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback & Roadmap for SNB & Neo4J (Presentation)

+

11:45 - 12:00 Orri Erling (OGL) Status, Feedback & Roadmap for SNB & Virtuoso (Presentation)

+

12:00 - 12:20 Arnau Prat (UPC) & Andrey Gubichev Status, Feedback & Roadmap for SNB Interactive & Sparksee (Presentation) and Business Intelligence (Presentation)

+

12:20 - 12:40 Tomer Sagi, “Experience with SNB and TitanDB at HP” (Presentation)

+

12:40 - 13:00 Jakob Nelson, “graphbench.org on the SNB datagen”

+

13:00 - 14:30 Lunch Break@Byzantine & Christian Museum (link)

+

14:30 - 14:50 Olaf Hartig, “Integrating the Property Graph and RDF data models” (Presentation)

+

Documents: arxiv/1409.3288, arxiv/1406.3399

+

14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, “Challenges to be addressed during Benchmarking SPARQL Federated Engines” (Presentation)

+

15:10 - 15:30 Evaggelia Pitoura, “Historical Queries on Graphs” (Presentation)

+

15:30 - 16:00 Coffee Break

+

16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, “Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases” (Presentation)

+

16:20 - 16:40 Gunes Aluc, “WatDiv: How to Tune-up your RDF Data Management System” (Presentation)

+

16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, “Benchmarking @LogicBlox” (Presentation)

+

17:00 - 17:15 Hassan Chafi, “Oracle Labs Graph Strategy”

+

17:15 - 17:25 Yinglong Xia, “Property Graphs for Industry Solution at IBM” (Presentation)

+

17:25 - 17:30 Arthur Keen, “Short Introduction to SPARQLcity”

+

20:30 Dinner @ Konservokouti (link)

+

Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion

+

Logistics

+

The meeting will be held at the National Hellenic Research Foundation located in downtown Athens.

+

+

Travel

+

Athens, Greece’s capital city, is easily accessible by air. Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.

+

To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).

+

You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: http://www.aia.gr/traveler/

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/first-tuc-meeting/index.html b/event/first-tuc-meeting/index.html new file mode 100644 index 00000000..68c6810b --- /dev/null +++ b/event/first-tuc-meeting/index.html @@ -0,0 +1,546 @@ + + + + + First TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

First TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Barry Bishop / on 22 Nov 2012
+ + + +
+ Location: Barcelona, Spain +
+
+ + Event dates: 19 Nov 2012 09:00 -- 20 Nov 2012 17:00 (local timezone) + +
+
+

The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the 19/20th November 2012.

+

So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event will include:

+
    +
  • Introduction by the coordinator and technical director explaining the objectives of the LDBC project
  • +
  • Invitation to users to explain their use-cases and describe the limitations they have found in current technology
  • +
  • Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points
  • +
+

The exact agenda will be published here as things get finalised before the event.

+

All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu

+ +

Agenda

+

We will start at 9:00 on Monday for a full day, followed by a half a day on Tuesday to allow attendees to travel home on the evening of the 20th.

+

Day 1

+

09:00 Welcome (Location: Aula Master)
+09:30 Project overview (Emphasis on task forces?) + Questionnaire results?
+10:30 Coffee break
+11:00 User talks (To gather information for use cases?)

+

13:00 Lunch

+

14:00 User talks (cont.)
+15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).
+16:00 Task force proposals (consortium)
+17:00 Finish first day

+

20:00 Social dinner

+

Day 2

+

10:00 Task force discussion (consortium + TUC)
+11:00 Coffee break
+11:30 Task force discussion (consortium + TUC)
+12:30 Summaries (Task forces, use cases, …) and actions

+

13:00 Lunch and farewell

+

15:00 LDBC Internal meeting

+

Slide

+

Opening session:

+ +

User stories:

+ +

Benchmark proposals:

+ +

Logistics

+
Date
+

19th and 20th November 2012

+
Location
+

The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:

+

Aula Master
+Edifici A3, Campus Nord UPC
+C. Jordi Girona, 1-3
+08034 Barcelona, Spain

+

Venue

+

To reach the campus, there are several options, including Taxi, Metro and Bus.

+

+

Finding UPC

+

+

Finding the meeting room

+

Getting there

+

Flying: Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this map of the airport). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.

+

Rail: The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.

+

Bus: The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.

+

Taxi: From the airport, you can take one of Barcelona’s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.

+

Train and bus: Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: http://www.barcelona-airport.com/eng/transport_eng.htm

+

+

The locations of the airport and the city centre

+

+

Bus map

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/fourteenth-tuc-meeting/index.html b/event/fourteenth-tuc-meeting/index.html new file mode 100644 index 00000000..b6e01c38 --- /dev/null +++ b/event/fourteenth-tuc-meeting/index.html @@ -0,0 +1,591 @@ + + + + + Fourteenth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Fourteenth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Gábor Szárnyas / on 17 Jun 2021
+ + + +
+ Location: Copenhagen, Tivoli Hotel, room Akvariet 2 +
+
+ + Event dates: 16 Aug 2021 16:00 -- 16 Aug 2021 20:00 (local timezone) + +
+
+

LDBC was hosting a one-day hybrid workshop, co-located with VLDB 2021 on August 16 (Monday) between 16:00–20:00 CEST.

+

The physical part of the workshop was held in room Akvariet 2 of the Tivoli Hotel (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC’s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.

+

Talks were scheduled to be 10 minutes with a short Q&A session. We had three sessions. Their schedules are shown below.

+

[16:00–17:25 CEST] LDBC updates, benchmarks, query languages

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startspeakertitle
16:00Peter Boncz (CWI)State of the union – slides
16:05Gábor Szárnyas (CWI)Overview of LDBC benchmarks – slides
16:12Mingxi Wu (TigerGraph)LDBC Social Network Benchmark results with TigerGraph – slides
16:24Xiaowei Zhu (Ant Group)Financial Benchmark proposal – slides
16:36Petra Selmer (Neo4j)Status report from the Existing Languages Working Group (ELWG) – slides, video
16:48Jan Hidders (Birkbeck)Status report from the Property Graph Schema Working Group (PGSWG) – slides, video
17:00Keith Hare (JCC Consulting)Database Language Standards Structure and Process, SQL/PGQ – slides, video
17:12Stefan Plantikow (GQL Editor)Report on the GQL standard – slides, video
+

coffee break (10 minutes)

+

[17:35–18:45 CEST] Systems and data structures

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startspeakertitle
17:35Vasileios Trigonakis (Oracle Labs)PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – slides, video
17:47Matthias Hauck (SAP)JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – slides, video
17:59Nikolay Yakovets (Eindhoven University of Technology)AvantGraph – slides, video
18:11Semih Salihoglu (University of Waterloo)GRainDB: Making RDBMSs Efficient on Graph Workloads Through Predefined Joins – slides, video
18:23Semyon Grigorev (Saint Petersburg University)Context-free path querying: Obstacles on the way to adoption – slides, video
18:35Per Fuchs (Technical University of Munich)Sortledton: A universal, transactional graph data structure – slides, video
+

coffee break (10 minutes)

+

[18:55-20:00 CEST] High-level approaches and benchmarks

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startspeakertitle
18:55Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – slides, video
19:07Vasia Kalavri (Boston University)Learning to partition unbounded graph streams – slides, video
19:19Muhammad Attahir Jibril (TU Ilmenau)Towards a Hybrid OLTP-OLAP Graph Benchmark – slides, video
19:31Riccardo Tommasini (University of Tartu)An outlook on Benchmarks for Graph Stream Processing – slides, video
19:43Mohamed Ragab (University of Tartu)Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – slides, video
+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/fourth-tuc-meeting/index.html b/event/fourth-tuc-meeting/index.html new file mode 100644 index 00000000..d24217ce --- /dev/null +++ b/event/fourth-tuc-meeting/index.html @@ -0,0 +1,558 @@ + + + + + Fourth TUC meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Fourth TUC meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Peter Boncz / on 03 Apr 2014
+ + + +
+ Location: Amsterdam, Netherlands +
+
+ + Event date: 03 Apr 2014 12:32 (local timezone) + +
+
+

The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.

+

This will be a one-day event at CWI in Amsterdam on Thursday April 3, 2014.

+

The event will include:

+
    +
  • Introduction to the objectives and progress of the LDBC project.
  • +
  • Description of the progress of the benchmarks being evolved through Task Forces.
  • +
  • Users explaining their use-cases and describing the limitations they have found in current technology.
  • +
  • Industry discussions on the contents of the benchmarks.
  • +
+

All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu

+

For presenters please limit your talks to just 15 minutes

+

Agenda

+

April 3rd

+
    +
  • +

    10:00 Peter Boncz (VUA) – pptx, video: LDBC project status update

    +
  • +
  • +

    10:20 Norbert Martinez (UPC) – pdf, video: Status update on the LDBC Social Network Benchmark (SNB) task force.

    +
  • +
  • +

    10:50 Alexandru Iosup (TU Delft) – ppt, video: Towards Benchmarking Graph-Processing Platforms

    +
  • +
  • +

    11:10 Mike Bryant (Kings College) – pptx, video: EHRI Project: Archival Integration with Neo4j

    +
  • +
+

11:30 coffee

+
    +
  • +

    11:50 Thilo Muth (University of Magdeburg) – pptx, video: MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis

    +
  • +
  • +

    12:10 Davy Suvee (Janssen Pharmaceutica / Johnson & Johnson) – video: Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph

    +
  • +
  • +

    12:30 Yongming Luo (TU Eindhoven) – pdf, video: Regularities and dynamics in bisimulation reductions of big graphs

    +
  • +
  • +

    12:50 Christopher Davis (TU Delft) – pdf, video: Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues

    +
  • +
+

13:10 - 14:30 lunch @ restaurant Polder

+
    +
  • +

    14:30 SPB task force report

    +
  • +
  • +

    15:00 Bastiaan Bijl (Sysunite) – pdf, video: Using a semantic approach for monitoring applications in large engineering projects

    +
  • +
  • +

    15:20 Frans Knibbe (Geodan) – pptx, video: Benchmarks for geographical data

    +
  • +
  • +

    15:40 Armando Stellato (University of Rome, Tor Vergata & UN Food and Agriculture Organization) – pptx, video: VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges

    +
  • +
+

16:00 coffee

+
    +
  • +

    16:20 Ralph Hodgson (TopQuadrant) – [pdf](https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf), video: Customer experiences in implementing SKOS-based vocabulary management systems

    +
  • +
  • +

    16:40 Simon Jupp (European Bioinformatics Institute) – pdf, video: Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.

    +
  • +
  • +

    17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – pdf, video: Breakmarking UniProt RDF. SPARQL queries that make your database cry…

    +
  • +
  • +

    17:20 Rein van ’t Veer (Digital Heritage Netherlands) – pptx, video: Time and space for heritage

    +
  • +
  • +

    17:40 end of meeting

    +
  • +
  • +

    19:00 - 21:30 Social Dinner in restaurant Boom

    +
  • +
+

April 4th

+

LDBC plenary meeting for project partners.

+ +

Logistics

+

The meeting will be held at the Dutch national research institute for computer science and mathematics (CWI - Centrum voor Wiskunde en Informatica). It is located at Amsterdam Science Park:

+

+

(A5 map)

+
Travel
+

Arriving & departing:

+

Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, www.schiphol.nl) that serves all main European carriers and also very many low-fare carriers.

+

http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane

+

Trains (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) – which is also the station where you are likely to arrive in case of an international train trip.

+

From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is within walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 – it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).

+

Taxi is an alternative, though expensive. The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).

+

Public transportation (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.

+

Only the “disposable” cards are interesting for you as a visitor.

+

Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.

+

Getting Around: the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.

+

Cars

+

In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the “WCW” terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.

+

Arriving at CWI: Once you arrive at CWI, you need to go to the reception desk and tell them that you are attending the LDBC TUC meeting. Then, you’ll receive a visitor’s pass that allows you to enter our building.

+

Social Dinner

+

The social dinner will take place at 7pm on April 3 in Restaurant Boom (boometenendrinken.nl), Linnaeusstraat 63, Amsterdam.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/index.html b/event/index.html new file mode 100644 index 00000000..34163de0 --- /dev/null +++ b/event/index.html @@ -0,0 +1,641 @@ + + + + + Events + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Events

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Sixteenth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

Organizers: Oskar van Rest, Alastair Green, Gábor Szárnyas

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2023 on June 23-24 (Friday-Saturday).

+

The program consists of 10- and 15-minute talks followed by a Q&A session. The talks will be recorded and made available online. If you would like to participate please register using our form.

+

LDBC will host a social event on Friday at the Black Bottle gastrotavern in Belltown: …

+ +
+
+ +
+ + +
+
+
+ +

Fifteenth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

Organizers: Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2022 on June 17-18 (Friday-Saturday).

+

The program consists of 10-15 minute talks followed by a Q&A session. The talks will be recorded and made available online.
+The tentative program is the following. All times are in EDT.

+

We will have a social event on Friday at 17:30 at El Vez (Google Maps).

+

Friday ( …

+ +
+
+ +
+ + +
+
+
+ +

Fourteenth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

LDBC was hosting a one-day hybrid workshop, co-located with VLDB 2021 on August 16 (Monday) between 16:00–20:00 CEST.

+

The physical part of the workshop was held in room Akvariet 2 of the Tivoli Hotel (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC’s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest …

+ +
+
+ +
+ + +
+
+
+ +

Thirteenth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.

+

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph …

+ +
+
+ +
+ + +
+
+
+ +

Twelfth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.

+

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, …

+ +
+
+ +
+ + +
+
+
+ +

Eleventh TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.

+

This TUC meeting will be a one-day event preceding the SIGMOD/PODS …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/index.xml b/event/index.xml new file mode 100644 index 00000000..05ed75af --- /dev/null +++ b/event/index.xml @@ -0,0 +1,5298 @@ + + + + Events on Linked Data Benchmark Council + https://ldbcouncil.org/event/ + Recent content in Events on Linked Data Benchmark Council + Hugo -- gohugo.io + en-us + &copy; Copyright LDBC 2024 + Fri, 23 Jun 2023 09:00:00 -0800 + + Announcing the Official Release of LDBC Financial Benchmark v0.1.0 + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + Tue, 27 Jun 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + <p>We are delighted to announce the official release of the initial version (v0.1.0) of <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench)</a>.</p> +<p>The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">LDBC FinBench Task Force</a>. The benchmark has one workload currently, <strong>Transaction Workload</strong>, capturing OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.</p> +<p>Compared to LDBC SNB, the FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the <a href="https://ldbcouncil.org/benchmarks/finbench/finbench-talk-16th-tuc.pdf">slides</a> in the 16th TUC. 
The <a href="https://arxiv.org/pdf/2306.15975.pdf">Financial Benchmark&rsquo;s specification</a> can be found on arXiv.</p> +<p>The release of FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2023. It is a good beginning for FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.</p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.</p> + + + + + Sixteenth TUC Meeting + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + Fri, 23 Jun 2023 09:00:00 -0800 + + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Oskar van Rest, Alastair Green, Gábor Szárnyas</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2023.sigmod.org/venue.shtml">SIGMOD 2023</a> on <strong>June 23-24 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10- and 15-minute talks followed by a Q&amp;A session. The talks will be recorded and made available online. 
<strong>If you would like to participate please register using <a href="https://forms.gle/T6bwVHzK9V5FaKyR9">our form</a>.</strong></p> +<p>LDBC will host a <strong>social event</strong> on Friday at the <a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a>.</p> +<p>In addition, AWS will host a <strong>Happy Hour</strong> (rooftop grill with beverages) on Saturday on the Amazon Nitro South building&rsquo;s 8th floor deck: <a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>.</p> +<h3 id="program">Program</h3> +<p><strong>All times are in PDT.</strong></p> +<h4 id="friday">Friday</h4> +<p><strong>Location:</strong> Hyatt Regency Bellevue on Seattle&rsquo;s Eastside, <strong>room Grand K</strong>, co-located with SIGMOD (<a href="https://www.hyatt.com/en-US/hotel/washington/hyatt-regency-bellevue-on-seattles-eastside/belle">900 Bellevue Way NE, Bellevue, WA 98004-4272</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>08:30</td> +<td>08:45</td> +<td>Oskar van Rest (Oracle)</td> +<td>LDBC – State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-ldbc-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/Frk7ITssaSY">video</a></td> +</tr> +<tr> +<td>08:50</td> +<td>09:05</td> +<td>Keith Hare (JCC / WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/keith-hare-an-update-on-the-gql-and-sql-pgq-standards-efforts.pdf">slides</a>, <a href="https://youtu.be/LQYkal_0j6E">video</a></td> +</tr> +<tr> +<td>09:10</td> +<td>09:25</td> +<td>Stefan Plantikow (Neo4j / WG3)</td> +<td>GQL - Introduction to a new query 
language standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/stefan-plantikow-gql-v1.pdf">slides</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Leonid Libkin (University of Edinburgh &amp; RelationalAI)</td> +<td>Formalizing GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/leonid-libkin-formalizing-gql.pdf">slides</a>, <a href="https://youtu.be/YZE1a00h1I4">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Semen Panenkov (JetBrains Research)</td> +<td>Mechanizing the GQL semantics in Coq – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/semyon-panenkov-gql-in-coq.pdf">slides</a>, <a href="https://youtu.be/5xBGohqWCzo">videos</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Oskar van Rest (Oracle)</td> +<td>SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-sql-property-graphs-in-oracle-database-and-oracle-graph-server-pgx.pdf">slides</a>, <a href="https://youtu.be/owM9WiQubpg">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (JCC)</td> +<td>LDBC&rsquo;s organizational changes and fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-ldbc-corporate-restructuring-and-fair-use-policies.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>Ioana Manolescu (INRIA)</td> +<td>Integrating Connection Search in Graph Queries – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ioana-manolescu-integrating-connection-search-in-graph-queries.pdf">slides</a>, <a 
href="https://youtu.be/LQPnmcrkUpY">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Maciej Besta (ETH Zurich)</td> +<td>Neural Graph Databases with Graph Neural Networks – <a href="https://youtu.be/ce5qNievRNs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:10</td> +<td>Longbin Lai (Alibaba Damo Academy)</td> +<td>To Revisit Benchmarking Graph Analytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/longbin-lai-benchmark-ldbc.pdf">slides</a>, <a href="https://youtu.be/s9Vtt-6t_FI">video</a></td> +</tr> +<tr> +<td>12:15</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>The World of Graph Databases from An Industry Perspective – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/yuanyuan-tian-world-of-graph-databases.pdf">slides</a>, <a href="https://youtu.be/AZuP_b95GPM">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Alin Deutsch (UC San Diego &amp; TigerGraph)</td> +<td>TigerGraph&rsquo;s Parallel Computation Model – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alin-deutsch-tigergraphs-computation-model.pdf">slides</a>, <a href="https://youtu.be/vcxdieJB80Y">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Chen Zhang (CreateLink)</td> +<td>Applications of a Native Distributed Graph Database in the Financial Industry – <a href="https://youtu.be/GCCT79Sps9I">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Ricky Sun (Ultipa)</td> +<td>Design of highly scalable graph database systems – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ricky-sun-ultipa.pdf">slides</a>, <a href="https://youtu.be/Sg1F64O4vGM">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> 
+<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Heng Lin (Ant Group)</td> +<td>The LDBC SNB implementation in TuGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-the-ldbc-snb-implementation-in-tugraph.pdf">slides</a>, <a href="https://youtu.be/fy8AuVerwnY">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>Shipeng Qi (Ant Group)</td> +<td>FinBench: The new LDBC benchmark targeting financial scenario – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/shipeng-qi-finbench.pdf">slides</a>, <a href="https://youtu.be/0xLZadDOfZk">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>17:00</td> +<td>host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>FinBench panel – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-finbench-panel.pdf">slides</a></td> +</tr> +<tr> +<td>19:00</td> +<td>22:00</td> +<td><em>dinner</em></td> +<td><em><a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a></em></td> +</tr> +</tbody> +</table> +<h4 id="saturday">Saturday</h4> +<p><strong>Location:</strong> Amazon Nitro South building, <strong>room 03.204</strong> (<a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:00</td> +<td>09:45</td> +<td>Brad Bebee (AWS)</td> +<td>Customers don&rsquo;t want a graph database, so why are we still here? 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/brad-bebee-tuc-keynote.pdf">slides</a>, <a href="https://youtu.be/bJlkpDC--fM">video</a></td> +</tr> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Fast and Efficient Update Handling for Graph H2TAP – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/muhammad-attahir-jibril-fast-and-efficient-update-handling-for-graph-h2tap.pdf">slides</a>, <a href="https://youtu.be/e8ZAszBsXV0">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Gabor Szarnyas (CWI)</td> +<td>LDBC Social Network Benchmark and Graphalytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-social-network-benchmark-and-graphalytics.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:30</td> +<td>Atanas Kiryakov and Tomas Kovachev (Ontotext)</td> +<td>GraphDB – Benchmarking against LDBC SNB &amp; SPB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tomas-kovatchev-atanas-kiryakov-benchmarking-graphdb-with-snb-and-spb.pdf">slides</a>, <a href="https://youtu.be/U6OPpNFOWqg">video</a></td> +</tr> +<tr> +<td>11:35</td> +<td>11:50</td> +<td>Roi Lipman (Redis Labs)</td> +<td>Delta sparse matrices within RedisGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/roi-lipman-delta-matrix.pdf">slides</a>, <a href="https://youtu.be/qfKsplV4Ihk">video</a></td> +</tr> +<tr> +<td>11:55</td> +<td>12:05</td> +<td>Rathijit Sen (Microsoft)</td> +<td>Microarchitectural Analysis of Graph BI Queries on RDBMS – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/rathijit-sen-microarchitectural-analysis.pdf">slides</a>, <a href="https://youtu.be/55B8CkH09js">video</a></td> +</tr> +<tr> +<td>12:10</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td><em>on your own</em></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Alastair Green (JCC)</td> +<td>LEX &ndash; LDBC Extended GQL Schema – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-lex.pdf">slides</a>, <a href="https://youtu.be/DVpeb4Ce9Uw">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Ora Lassila (AWS)</td> +<td>Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ora-lassila-why-limit-yourself-to-lpg-when-you-can-do-rdf-too.pdf">slides</a>, <a href="https://youtu.be/7uAInoUwdds">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Jan Hidders (Birkbeck, University of London)</td> +<td>PG-Schema: a proposal for a schema language for property graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/jan-hidders-pg-schema.pdf">slides</a>, <a href="https://youtu.be/yQNL8hBTE4M">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Max de Marzi (RageDB and RelationalAI)</td> +<td>RageDB: Building a Graph Database in Anger – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/max-de-marzi-ragedb-building-a-graph-database-in-anger.pdf">slides</a>, <a href="https://youtu.be/LBbF8aslYFE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Umit Catalyurek (AWS)</td> +<td>HPC Graph Analytics on the OneGraph Model – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/umit-catalyurek-onegraph-hpc.pdf">slides</a>, <a href="https://youtu.be/64tv5LA6Wr8">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>David J. Haglin (Trovares)</td> +<td>How LDBC impacts Trovares – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/david-haglin-trovares.pdf">slides</a>, <a href="">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>16:25</td> +<td>Wenyuan Yu (Alibaba Damo Academy)</td> +<td>GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/wenyuan-yu-graphscope-flex.pdf">slides</a>, <a href="https://youtu.be/cRikoyDmMks">video</a></td> +</tr> +<tr> +<td>16:30</td> +<td>16:40</td> +<td>Scott McMillan (Carnegie Mellon University)</td> +<td>Graph processing using GraphBLAS – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/scott-mcmillan-graph-processing-using-graphblas.pdf">slides</a>, <a href="https://youtu.be/yb4hGBhUzQQ">video</a></td> +</tr> +<tr> +<td>16:45</td> +<td>16:55</td> +<td>Tim Mattson (Intel)</td> +<td>Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tim-mattson-graphblas-and-tiledb.pdf">slides</a></td> +</tr> +<tr> +<td>17:00</td> +<td>20:00</td> +<td><em>happy hour (rooftop grill with beverages)</em></td> +<td><em>on the Nitro South building&rsquo;s 8th floor deck</em></td> +</tr> +</tbody> +</table> +<h4 id="tuc-event-locations">TUC event locations</h4> +<p>A <a href="https://www.google.com/maps/d/u/0/edit?mid=19_fi4fV-3-PZkNWCCcmhU86ct2EZXbgo">map of the LDBC TUC events</a> we hosted so far.</p> + + + + + LDBC SNB – Early 2023 updates + 
https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + Wed, 15 Feb 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + <p>2023 has been an eventful year for us so far. Here is a summary of our recent activities.</p> +<ol> +<li> +<p>Our paper <a href="https://ldbcouncil.org/docs/papers/ldbc-snb-bi-vldb-2022.pdf">The LDBC Social Network Benchmark: Business Intelligence Workload</a> was published in PVLDB.</p> +</li> +<li> +<p>David Püroja just completed his MSc thesis on creating a design towards <a href="https://ldbcouncil.org/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf">SNB Interactive v2</a> at CWI&rsquo;s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference&rsquo;s graph developer room titled <a href="https://fosdem.org/2023/schedule/event/graph_ldbc/">The LDBC Social Network Benchmark</a> (<a href="https://www.youtube.com/watch?v=YNF6z6gtXY4">YouTube mirror</a>).</p> +</li> +<li> +<p>I gave a lightning talk at FOSDEM&rsquo;s HPC developer room titled <a href="https://www.youtube.com/watch?v=q26DHnQFw54">The LDBC Benchmark Suite</a> (<a href="https://www.youtube.com/watch?v=q26DHnQFw54">YouTube mirror</a>).</p> +</li> +<li> +<p>Our auditors have successfully benchmarked a number of systems:</p> +<ul> +<li>SPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze)</li> +<li>SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja)</li> +<li>SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr)</li> +</ul> +</li> +</ol> +<p>The results and the full disclosure reports are available under the <a href="https://ldbcouncil.org/benchmarks/spb/">SPB</a> and <a href="https://ldbcouncil.org/benchmarks/snb/">SNB benchmark pages</a>.</p> + + + + + LDBC SNB Datagen – The winding path to SF100K + 
https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + Tue, 13 Sep 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + <p>LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">last technical update</a> on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we reached several goals: we refactored the serializers to use Spark&rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.</p> +<h1 id="moving-to-sparksql">Moving to SparkSQL</h1> +<p>We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL&rsquo;s capabilities.</p> +<blockquote> +<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. 
Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.</p> +</blockquote> +<p>Dealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL is unsuitable. We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup>, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators&rsquo; state between the UDFs to ensure deterministic execution. Weighing these and that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:</p> +<ol> +<li>table manipulations related to shaping the output into the supported layouts and data types as set forth in the specification;</li> +<li>deriving the Interactive and BI datasets;</li> +<li>and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes.</li> +</ol> +<p>We refer to points (1.) and (2.) collectively as dataset transformation, while (3.) as factor generation. Initially, these had been part of the generator; they were extracted as part of this refactor, which resulted in a cleaner, more maintainable design.</p> +<p><img src="datagen_df_0.png" alt="Datagen stages"></p> +<p>The diagram above shows the components on a high level. 
The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there&rsquo;s no simple way to avoid<br> +it, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.</p> +<p>I&rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">previous blogpost in the series</a> or the <a href="https://arxiv.org/abs/2001.02299">Interactive benchmark specification</a>.</p> +<h1 id="transformation-pipeline">Transformation pipeline</h1> +<p>The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:</p> +<ul> +<li>explodes edges and / or attributes into separate tables,</li> +<li>subsets the snapshot part and creates insert / delete batches for the BI workload,</li> +<li>subsets the snapshot part for the Interactive workload,</li> +<li>applies formatting related options such as date time representation,</li> +<li>serializes the data to a Spark supported format (CSV, Parquet),</li> +</ul> +<p>We utilize a flexible data pipeline that operates on the graph.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code 
class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span>, <span style="color:#66d9ef">M2</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">]</span> <span style="color:#a6e22e">extends</span> <span style="color:#f92672">(</span><span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">])</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">In</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span 
style="color:#66d9ef">def</span> apply<span style="color:#f92672">(</span>v<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">])</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>v<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>The <code>Transform</code> trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let&rsquo;s see some of the transformations we have.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToBiTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">BI</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> keepImplicitDeletes<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span 
style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.BI</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span 
style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeEdges</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> 
+</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Therefore, a transformation pipeline may look like this:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>params<span style="color:#f92672">,</span> start<span style="color:#f92672">,</span> end<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>inputGraph<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>The <code>Graph</code> record has a <code>definition</code> field containing graph-global metadata, whereas <code>entities</code> holds the 
datasets keyed by their entity type. There are 3 graph <em>modes</em> currently: <code>Raw</code>, <code>Interactive</code> and <code>BI</code>. The BI dataset has a different layout than the rest, as it contains incremental inserts and deletes for the entities in addition to the bulk snapshot. This is captured in the <code>Layout</code> dependent type, over which the entities are polymorphic.</p> +<p>It&rsquo;s important to understand that <code>Graph</code> holds <code>DataFrame</code>s, and these are lazily computed by Spark. So, <code>Graph</code> is merely a description of transformations used to derive its constituent datasets, which makes them subject to all the SparkSQL fanciness such as query optimization, whole-stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> isAttrExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> isEdgesExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> useTimestamp<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> mode<span style="color:#66d9ef">:</span> 
<span style="color:#66d9ef">M</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">Option</span><span style="color:#f92672">[</span><span style="color:#66d9ef">String</span><span style="color:#f92672">]]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> definition<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M</span><span style="color:#f92672">],</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">M</span><span style="color:#66d9ef">#</span><span style="color:#66d9ef">Layout</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">sealed</span> <span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Layout</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Raw</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Interactive</span><span style="color:#f92672">(</span>bulkLoadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... 
*/</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">BI</span><span style="color:#f92672">(</span>bulkloadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">,</span> batchPeriod<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">String</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">BatchedEntity</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>You may notice that <code>Transform</code> is statically typed w.r.t. <code>Mode</code>; however, other properties, like <code>isAttrExploded</code> or <code>isEdgesExploded</code>, are not captured in the type, and remain merely dynamic. This makes some nonsensical transformation pipelines (e.g. one that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.</p> +<p>As we already mentioned, <code>Graph</code> is essentially a persistent container of <code>EntityType -&gt; DataFrame</code> mappings. 
<code>EntityType</code> can be <code>Node</code>, <code>Edge</code> or <code>Attr</code>, and is used to identify the entity and embellish it with static metadata, such as a descriptive name and primary key, whether it is static or dynamic (as per the specification), and in the case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.</p> +<p>Usually, a graph transformation involves matching entities based on their <code>EntityType</code>, and modifying the mapping (and if required, other metadata). Take, for example, the <code>ExplodeAttrs</code> transformation, which explodes into separate tables the values of two columns of <code>Person</code> stored as arrays:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">if</span> <span 
style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#75715e">// assert at runtime that the transformation hasn&#39;t been applied yet +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#66d9ef">throw</span> <span style="color:#66d9ef">new</span> <span style="color:#a6e22e">AssertionError</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Attributes already exploded in the input graph&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> explodedAttr<span style="color:#f92672">(</span>attr<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Attr</span><span style="color:#f92672">,</span> node<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">DataFrame</span><span style="color:#f92672">,</span> column<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Column</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">=</span> +</span></span><span style="display:flex;"><span> attr <span style="color:#f92672">-&gt;</span> node<span style="color:#f92672">.</span>select<span style="color:#f92672">(</span>withRawColumns<span style="color:#f92672">(</span>attr<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>parent<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">),</span> 
explode<span style="color:#f92672">(</span>split<span style="color:#f92672">(</span>column<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;;&#34;</span><span style="color:#f92672">)).</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>attribute<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">)))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> modifiedEntities <span style="color:#66d9ef">=</span> input<span style="color:#f92672">.</span>entities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>collect <span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k <span style="color:#66d9ef">@</span> <span style="color:#a6e22e">Node</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person&#34;</span><span style="color:#f92672">,</span> <span style="color:#66d9ef">false</span><span style="color:#f92672">),</span> df<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#75715e">// match the Person node. 
This is the only one ExplodeAttrs should modify +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#a6e22e">Map</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Email&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;EmailAddress&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonEmailEmailAddress&#34; entity derived by exploding the email column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Speaks&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;Language&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonSpeaksLanguage&#34; entity derived by exploding the language column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> k <span style="color:#f92672">-&gt;</span> df<span style="color:#f92672">.</span>drop<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">)</span> <span style="color:#75715e">// drop the exploded 
columns from person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntities <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>entities<span style="color:#f92672">)(</span><span style="color:#66d9ef">_</span> <span style="color:#f92672">++</span> <span style="color:#66d9ef">_</span><span style="color:#f92672">)</span> <span style="color:#75715e">// merge-replace the modified entities in the graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntityDefinitions <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#f92672">(</span>e<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> e <span style="color:#f92672">++</span> v<span style="color:#f92672">.</span>map<span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> k <span 
style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Some</span><span style="color:#f92672">(</span>v<span style="color:#f92672">.</span>schema<span style="color:#f92672">.</span>toDDL<span style="color:#f92672">)</span> <span style="color:#f92672">}</span> <span style="color:#75715e">// update the entity definition schema to reflect the modifications +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> l <span style="color:#66d9ef">=</span> lens<span style="color:#f92672">[</span><span style="color:#66d9ef">In</span><span style="color:#f92672">]</span> <span style="color:#75715e">// lenses provide a terse syntax for modifying nested fields +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">(</span>l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>entities<span style="color:#f92672">).</span>set<span style="color:#f92672">(</span>input<span style="color:#f92672">)((</span><span style="color:#66d9ef">true</span><span style="color:#f92672">,</span> updatedEntityDefinitions<span style="color:#f92672">,</span> updatedEntities<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Note that <code>EntityType</code> does not hold the dataset&rsquo;s full SQL schema currently, as it&rsquo;s not useful for pattern matching, but can be accessed directly from <code>DataFrame</code> if 
needed.</p> +<h1 id="inputoutput">Input/output</h1> +<p>The <code>Reader</code> and <code>Writer</code> typeclasses are used to read from a <code>Source</code> and write to a <code>Sink</code> respectively, terminating a graph transformation pipeline<br> +on both ends.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">T</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> read<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> exists<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">S</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Data</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> write<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Data</span><span style="color:#f92672">,</span> sink<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">S</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Unit</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>There are implementations under <code>ldbc.snb.datagen.io.instances</code> that read a graph from a <code>GraphSource</code> and write to a <code>GraphSink</code>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span 
style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> source <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSource</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>read<span style="color:#f92672">(</span>source<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span><span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transformedGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>graph<span style="color:#f92672">)</span> +</span></span><span 
style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> sink <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSink</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>write<span style="color:#f92672">(</span>transformedGraph<span style="color:#f92672">,</span> sink<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>We provide <a href="https://github.com/typelevel/simulacrum">Ops syntax</a> to make it shorter:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> 
ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Reader.ops._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Writer.ops._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">).</span>read +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transformedGraph <span 
style="color:#66d9ef">=</span> <span style="color:#f92672">???</span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span>transformedGraph<span style="color:#f92672">.</span>write<span style="color:#f92672">(</span><span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">))</span> +</span></span></code></pre></div><p>The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.</p> +<p>Spark has a facility to derive SparkSQL schema from case classes automatically<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup>. We created case classes for each entity in the <code>Raw</code> dataset. 
We also created a typeclass <code>EntityTraits</code> associating these classes with their <code>EntityType</code>, so we can summon them (and consequently their SparkSQL schema) in the reader.</p> +<p>The case classes are used during the serialization of the generated dataset too, but more about that later.</p> +<h1 id="factor-generation">Factor generation</h1> +<p>As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity local aggregates was impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a manageable scope, the original factor generation code had been removed.</p> +<p>We decided it&rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator&rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number of factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing certain factor tables even use <a href="https://spark.apache.org/graphx/">GraphX</a>, which was unimaginable with the previous design.</p> +<p>Factor tables are added by extending a map with a <code>name -&gt; Factor</code> pair. 
<code>Factor</code> declares its input entities, and accepts a function that receives input <code>DataFrames</code>, and returns a single <code>DataFrame</code> as output.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> factors <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Map</span> <span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;personDisjointEmployerPairs&#34;</span> <span style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Factor</span><span style="color:#f92672">(</span><span style="color:#a6e22e">PersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonKnowsPersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">OrganisationType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonWorkAtCompanyType</span><span style="color:#f92672">)</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">case</span> <span style="color:#a6e22e">Seq</span><span style="color:#f92672">(</span>person<span style="color:#f92672">,</span> personKnowsPerson<span style="color:#f92672">,</span> organisation<span style="color:#f92672">,</span> workAt<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> knows <span style="color:#66d9ef">=</span> undirectedKnows<span style="color:#f92672">(</span>personKnowsPerson<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> company <span style="color:#66d9ef">=</span> 
organisation<span style="color:#f92672">.</span>where<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;Type&#34;</span> <span style="color:#f92672">===</span> <span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">).</span>cache<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> personSample <span style="color:#66d9ef">=</span> person +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>orderBy<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>limit<span style="color:#f92672">(</span><span style="color:#ae81ff">20</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> personSample +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person2&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>knows<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;knows&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;knows.person2Id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>workAt<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;workAt&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;workAt.PersonId&#34;</span> <span 
style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;knows.Person1id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>company<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;Company.id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;workAt.CompanyId&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>select<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2id&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.name&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyName&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyId&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.creationDate&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2creationDate&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.deletionDate&#34;</span><span 
style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2deletionDate&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>distinct<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">},</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* more factors */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span></code></pre></div><p>As you can see, it&rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. <code>undirectedKnows</code>). Currently, there&rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. it uses the dataframe writer under the hood.</p> +<h1 id="revamping-the-data-generators-serializer">Revamping the data generator&rsquo;s serializer</h1> +<p>At this point, both the transformation pipeline and factor generator were ready, however the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator&rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we&rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.</p> +<blockquote> +<p><a href="https://parquet.apache.org/">Parquet</a> is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. 
It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.</p> +</blockquote> +<p>The new serialization framework is heavily influenced by the design of Java <code>OutputStreams</code>, in the sense that stateful objects are composed to form a pipeline. For example, in the case of <em>activities</em>, the input is an activity tree, and the output is a set of rows in multiple files (e.g. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown in the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).</p> +<p><img src="activity.png" alt="Activity serialization pipeline"></p> +<p>The benefit of this architecture is that only the last component needs to change when we add support for a new output format.</p> +<p>To support Parquet, we made use of row-level serializers available in Hadoop&rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. Remember how we used case classes for the <code>Raw</code> entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. 
<code>Forum</code>) and Spark&rsquo;s <code>Encoder</code> framework to encode the entities in Parquet, which means that the generated output remains consistent with <code>DataFrame</code>-based reader, and we spare a lot of code duplication.</p> +<h1 id="optimizations">Optimizations</h1> +<p>After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out of memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (subsequently, larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation<sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup>. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still more than 6x reduction in cost). We weren&rsquo;t able to run SF30K on 10 machines (1 machine / SF3K), even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf3k_bi <span style="color:#ae81ff">3000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">330</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf10k_bi <span style="color:#ae81ff">10000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span 
style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">1000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><p>The above examples are working configurations for generating the 3K and 10K BI datasets. The <code>--sf-per-executor</code> option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes respectively. The <code>--partitions</code> option controls the total number of partitions, and was calculated based on the number of persons using the formula <code>partitions = ceil(number_of_persons / block_size / 3)</code> to get a maximum of 3 blocks per partition.</p> +<h1 id="conclusion">Conclusion</h1> +<p>These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.</p> +<h1 id="footnotes">Footnotes</h1> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>The generator produces hierarchies, such as forum wall with a random number of posts, that have comments, etc. 
This tree is iterated, and different entities are written to separate files.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>Shameless plug: You can learn more on this from <a href="https://www.dataversity.net/case-study-deriving-spark-encoders-and-schemas-using-implicits/">another blogpost of mine</a>.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>The data generator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>The maximum row count per file is currently 10M, however, this can be modified with a command line option. We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Fifteenth TUC Meeting + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + Fri, 17 Jun 2022 09:20:00 -0500 + + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2022.sigmod.org/venue.shtml">SIGMOD 2022</a> on <strong>June 17-18 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10-15 minute talks followed by a Q&amp;A session. The talks will be recorded and made available online.<br> +The tentative program is the following. 
<strong>All times are in EDT.</strong></p> +<p>We will have a social event on Friday at 17:30 at <a href="https://elvezrestaurant.com/">El Vez</a> (<a href="https://g.page/ElVezPhilly">Google Maps</a>).</p> +<h4 id="friday-pennsylvania-convention-centerhttpswwwpaconventioncom-room-204bhttps2022sigmodorgprogramshtml">Friday (<a href="https://www.paconvention.com/">Pennsylvania Convention Center</a>, <a href="https://2022.sigmod.org/program.shtml">room 204B</a>)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:20</td> +<td>09:30</td> +<td>Peter Boncz (LDBC/CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/39BoOIGk9Is">video</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Alastair Green (LDBC/Birkbeck)</td> +<td>LDBC&rsquo;s fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-fair-use-of-the-ldbc-trademark.pdf">slides</a>, <a href="https://youtu.be/7zmCysN4Rpg">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)</td> +<td>LDBC Social Network Benchmark: Business Intelligence workload v1.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/gabor-szarnyas-the-ldbc-social-network-benchmark-business-intelligence-workload.pdf">slides</a>, <a href="https://youtu.be/AJ96M8_njxE">video</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Heng Lin (Ant Group)</td> +<td>LDBC Financial Benchmark introduction – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/heng-lin-ldbc-financial-benchmark-introduction.pdf">slides</a>, <a 
href="https://youtu.be/iBhud_YjafY">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Chen Zhang (CreateLink)</td> +<td>New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/chen-zhang-new-ldbc-snb-benchmark-record-by-galaxybase-more-than-6-times-faster-and-70-percent-higher-throughput.pdf">slides</a>, <a href="https://youtu.be/sMzTsb8iw_Y">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>James Clarkson (Neo4j)</td> +<td>LDBC benchmarks: Promoting good science and industrial consumption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/james-clarkson-ldbc-benchmarks-promoting-good-science-and-industrial-consumption.pdf">slides</a>, <a href="https://youtu.be/VYG1mzcl9qQ">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Oskar van Rest (Oracle)</td> +<td>Creating and querying property graphs in Oracle, on-premise and in the cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oskar-van-rest-creating-and-querying-property-graphs-in-oracle-on-premise-and-in-the-cloud.pdf">slides</a>, <a href="https://youtu.be/2HX2Vixf2gs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:15</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>Conquering LDBC SNB BI at SF-10k – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/mingxi-wu-conquering-ldbc-snb-bi-at-sf10k.pdf">slides</a>, <a href="https://youtu.be/oJbqzQ_t3G8">video</a></td> +</tr> +<tr> +<td>12:20</td> +<td>13:20</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:20</td> +<td>13:35</td> +<td>Altan Birler (Technische Universität München)</td> +<td>Relational databases can 
handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/altan-birler-relational-databases-can-handle-graphs-too.pdf">slides</a>, <a href="https://youtu.be/cRgbdY3I2i4">video</a></td> +</tr> +<tr> +<td>13:40</td> +<td>13:55</td> +<td>David Püroja (CWI)</td> +<td>LDBC Social Network Benchmark: Interactive workload v2.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/david-puroja-ldbc-snb-interactive-workload-v2.0.pdf">slides</a></td> +</tr> +<tr> +<td>14:00</td> +<td>14:15</td> +<td>Angela Bonifati (Lyon 1 University)</td> +<td>The quest for schemas in graph databases – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/angela-bonifati-the-quest-for-schemas-in-graph-databases.pdf">slides</a>, <a href="https://youtu.be/VT7cx3Jp7V8">video</a></td> +</tr> +<tr> +<td>14:20</td> +<td>14:35</td> +<td>Matteo Lissandrini (Aalborg University)</td> +<td>Understanding graph data representations in triplestores – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/matteo-lissandrini-understanding-graph-data-representations-in-triplestores.pdf">slides</a>, <a href="https://youtu.be/xqVMJZfh_JU">video</a></td> +</tr> +<tr> +<td>14:40</td> +<td>14:55</td> +<td>Wim Martens (University of Bayreuth)</td> +<td>Path representations – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/wim-martens-path-representations.pdf">slides</a>, <a href="https://youtu.be/Ma-E5dwgf-E">video</a></td> +</tr> +<tr> +<td>15:00</td> +<td>15:20</td> +<td>Audrey Cheng (UC Berkeley)</td> +<td>TAOBench: An end-to-end benchmark for social network workloads – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/audrey-cheng-taobench.pdf">slides</a>, <a href="https://youtu.be/1p8AStxS3es">video</a></td> +</tr> +</tbody> +</table> +<h4 id="saturday-philadelphia-marriott-downtownhttpswwwmarriottcomen-ushotelsphldt-philadelphia-marriott-downtown-room-401-402-4th-floor">Saturday (<a href="https://www.marriott.com/en-us/hotels/phldt-philadelphia-marriott-downtown/">Philadelphia Marriott Downtown</a>, room 401-402, 4th floor)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Keith Hare (WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/keith-hare-property-graph-standards-process-and-timing.pdf">slides</a>, <a href="https://youtu.be/xFVD3LWnKlc">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>10:35</td> +<td>Leonid Libkin (ENS Paris)</td> +<td>Pattern matching in GQL and SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/leonid-libkin-pattern-matching-in-gql-and-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/OvGsa0qLANE">video</a></td> +</tr> +<tr> +<td>10:40</td> +<td>10:55</td> +<td>Petra Selmer (Neo4j/WG3)</td> +<td>An overview of GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/petra-selmer-towards-gql-v1-a-property-graph-query-language-standard.pdf">slides</a>, <a href="https://youtu.be/tncf2FgyIyo">video</a></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (LDBC/WG3)</td> +<td>GQL 2.0: A technical manifesto – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-gql-2.0-a-technical-manifesto.pdf">slides</a>, <a 
href="https://youtu.be/upIvpYy8C2g">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>George Fletcher (TU Eindhoven)</td> +<td>PG-Keys (LDBC Property Graph Schema Working Group) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/george-fletcher-pg-keys-keys-for-property-graphs.pdf">slides</a>, <a href="https://youtu.be/_W8-jOtcObc">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Arvind Shyamsundar (Microsoft)</td> +<td>Graph capabilities in Microsoft SQL Server and Azure SQL Database – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/arvind-shyamsundar-graph-capabilities-in-microsoft-sql-server-and-azure-database.pdf">slides</a>, <a href="https://youtu.be/xxV2BfZupGw">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>13:30</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Daniël ten Wolde (CWI)</td> +<td>Implementing SQL/PGQ in DuckDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/daniel-ten-wolde-implementing-sql-pgq-in-duckdb.pdf">slides</a>, <a href="https://youtu.be/JmSfU0BTH5w">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Oszkár Semeráth, Kristóf Marussy (TU Budapest)</td> +<td>Generation techniques for consistent, realistic, diverse, and scalable graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oszkar-semerath-generation-techniques-for-consistent-realistic-diverse-and-scalable-graphs.pdf">slides</a>, <a href="https://youtu.be/hB6j6mvh-vA">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Molham Aref (RelationalAI)</td> +<td>Graph Normal Form – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/molham-aref-graph-normal-form.pdf">slides</a>, <a 
href="https://youtu.be/-kP4Raqr5KA">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Naomi Arnold (Queen Mary University of London)</td> +<td>Temporal graph analysis of the far-right social network Gab – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/naomi-arnold-temporal-graph-analysis-of-the-far-right-social-network-gab.pdf">slides</a>, <a href="https://youtu.be/ugSkFlif4PE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:05</td> +<td>Domagoj Vrgoč (PUC Chile)</td> +<td>Evaluating path queries in MillenniumDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/domagoj-vrgoc-regular-path-queries-in-millenniumdb.pdf">slides</a>, <a href="https://youtu.be/_OzJ6vI7GNU">video</a></td> +</tr> +<tr> +<td>15:10</td> +<td>15:25</td> +<td>Pavel Klinov, Evren Sirin (Stardog)</td> +<td>Stardog&rsquo;s experience with LDBC – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/evren-sirin-stardog-experience-with-ldbc.pdf">slides</a>, <a href="https://youtu.be/CBrEeOTqGKM">video</a></td> +</tr> +</tbody> +</table> + + + + + Announcing the LDBC Financial Benchmark Task Force + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + Thu, 26 May 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + <p>We are delighted to announce the set up of the <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench) task force</a>.</p> +<p>The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. 
The FinBench is scheduled to be released in the end of 2022.</p> +<p>Compared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.</p> +<p>The FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). See the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">Work Charter for FinBench</a></p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.</p> + + + + + Fourteenth TUC Meeting + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + Mon, 16 Aug 2021 16:00:00 +0200 + + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + <p>LDBC was hosting a one-day hybrid workshop, co-located with <a href="https://vldb.org/2021/">VLDB 2021</a> on <strong>August 16 (Monday) between 16:00–20:00 CEST</strong>.</p> +<p>The physical part of the workshop was held in room Akvariet 2 of the <a href="https://www.tivolihotel.com/">Tivoli Hotel</a> (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC&rsquo;s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.</p> +<p>Talks were scheduled to be 10 minutes with a short Q&amp;A session. We had three sessions. 
Their schedules are shown below.</p> +<h4 id="16001725-cest-ldbc-updates-benchmarks-query-languages">[16:00–17:25 CEST] LDBC updates, benchmarks, query languages</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>16:00</td> +<td>Peter Boncz (CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a></td> +</tr> +<tr> +<td>16:05</td> +<td>Gábor Szárnyas (CWI)</td> +<td>Overview of LDBC benchmarks – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-benchmarks.pdf">slides</a></td> +</tr> +<tr> +<td>16:12</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>LDBC Social Network Benchmark results with TigerGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mingxi-wu-tigergraph-snb-preliminary-results.pdf">slides</a></td> +</tr> +<tr> +<td>16:24</td> +<td>Xiaowei Zhu (Ant Group)</td> +<td>Financial Benchmark proposal – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/xiaowei-zhu-financial-benchmark.pdf">slides</a></td> +</tr> +<tr> +<td>16:36</td> +<td>Petra Selmer (Neo4j)</td> +<td>Status report from the Existing Languages Working Group (ELWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/petra-selmer-elwg.pdf">slides</a>, <a href="https://youtu.be/I5A8VuFDhsA">video</a></td> +</tr> +<tr> +<td>16:48</td> +<td>Jan Hidders (Birkbeck)</td> +<td>Status report from the Property Graph Schema Working Group (PGSWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/jan-hidders-pgswg.pdf">slides</a>, <a href="https://youtu.be/iEbVi9T-HVk">video</a></td> +</tr> +<tr> +<td>17:00</td> +<td>Keith Hare (JCC 
Consulting)</td> +<td>Database Language Standards Structure and Process, SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/keith-hare-database-language-standards-structure-and-process-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/ZgFCuzods4g">video</a></td> +</tr> +<tr> +<td>17:12</td> +<td>Stefan Plantikow (GQL Editor)</td> +<td>Report on the GQL standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/stefan-plantikow-gql.pdf">slides</a>, <a href="https://youtu.be/z0pN5NwKsgc">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="17351845-cest-systems-and-data-structures">[17:35–18:45 CEST] Systems and data structures</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>17:35</td> +<td>Vasileios Trigonakis (Oracle Labs)</td> +<td>PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasileios-trigonakis-pgxd-adfs.pdf">slides</a>, <a href="https://youtu.be/cv2ZfWRBOek">video</a></td> +</tr> +<tr> +<td>17:47</td> +<td>Matthias Hauck (SAP)</td> +<td>JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/matthias-hauck-json-spatial-graph-sap-hana-cloud.pdf">slides</a>, <a href="https://youtu.be/dgpMJFho6Q8">video</a></td> +</tr> +<tr> +<td>17:59</td> +<td>Nikolay Yakovets (Eindhoven University of Technology)</td> +<td>AvantGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/nikolay-yakovets-avantgraph.pdf">slides</a>, <a href="https://youtu.be/9M9FOycovTw">video</a></td> +</tr> +<tr> +<td>18:11</td> +<td>Semih Salihoglu (University of Waterloo)</td> +<td>GRainDB: Making RDBMSs Efficient on Graph Workloads 
Through Predefined Joins – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semih-salihoglu-graindb.pdf">slides</a>, <a href="https://youtu.be/FFK3y6vPHJs">video</a></td> +</tr> +<tr> +<td>18:23</td> +<td>Semyon Grigorev (Saint Petersburg University)</td> +<td>Context-free path querying: Obstacles on the way to adoption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semyon-grigorev-cfpq.pdf">slides</a>, <a href="https://youtu.be/pha1xIpEL3I">video</a></td> +</tr> +<tr> +<td>18:35</td> +<td>Per Fuchs (Technical University of Munich)</td> +<td>Sortledton: A universal, transactional graph data structure – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/per-fuchs-sortledton.pdf">slides</a>, <a href="https://youtu.be/33ZjsNN0hhU">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="1855-2000-cest-high-level-approaches-and-benchmarks">[18:55-20:00 CEST] High-level approaches and benchmarks</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>18:55</td> +<td>Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)</td> +<td>Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/angelos-christos-anadiotis-investigative-journalism-graph-data-management.pdf">slides</a>, <a href="https://youtu.be/a1VYjyec8dg">video</a></td> +</tr> +<tr> +<td>19:07</td> +<td>Vasia Kalavri (Boston University)</td> +<td>Learning to partition unbounded graph streams – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasia-kalavri-learning-to-partition-unbounded-graph-streams.pdf">slides</a>, <a 
href="https://youtu.be/PTlUABKWniA">video</a></td> +</tr> +<tr> +<td>19:19</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Towards a Hybrid OLTP-OLAP Graph Benchmark – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/muhammad-attahir-jibril-hybrid-oltp-olap-benchmark.pdf">slides</a>, <a href="https://youtu.be/tMBVszTSJXc">video</a></td> +</tr> +<tr> +<td>19:31</td> +<td>Riccardo Tommasini (University of Tartu)</td> +<td>An outlook on Benchmarks for Graph Stream Processing – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/riccardo-tommasini-graph-stream-processing-benchmarks.pdf">slides</a>, <a href="https://youtu.be/HabvJvPXsLc">video</a></td> +</tr> +<tr> +<td>19:43</td> +<td>Mohamed Ragab (University of Tartu)</td> +<td>Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mohamed-ragab-benchranking.pdf">slides</a>, <a href="https://youtu.be/mZ8LhGUq7Wg">video</a></td> +</tr> +</tbody> +</table> + + + + + Thirteenth TUC Meeting + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + Tue, 30 Jun 2020 14:00:00 +0000 + + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + <p>LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Gabor Szarnyas (BME) to register.</p> +<h3 id="snb-task-force">SNB Task Force</h3> +<ul> +<li>Progress report +<ul> +<li>ACID compliance test suite</li> +<li>Integrating deletions to Datagen</li> +<li>Migrating Datagen to Spark</li> +<li>Redesign of BI read queries</li> +<li>Extensions to the driver</li> +</ul> +</li> +<li>Ongoing work +<ul> +<li>Datagen: tuning the distribution of deletes</li> +<li>Interactive 2.0 workload</li> +<li>BI 1.0 workload</li> +</ul> +</li> +</ul> +<p>Zoom links will be sent through email.</p> + + + + + Speeding Up LDBC SNB Datagen + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + Fri, 12 Jun 2020 00:00:00 +0000 + + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + <p>LDBC&rsquo;s <a href="#references">Social Network Benchmark [4]</a> (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems&rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.</p> +<p>LDBC SNB provides <a href="https://github.com/ldbc/ldbc_snb_datagen">Datagen</a> (Data Generator), which produces synthetic datasets, mimicking a social network&rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms, as well as public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).</p> +<h2 id="overview">Overview</h2> +<p>The benchmark&rsquo;s specification describes a social network <a href="https://github.com/ldbc/ldbc_snb_docs/blob/9253abbde94ec7eaccd366c5d4c15cca30752e36/figures/schema-comfortable.pdf">data model</a> which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment on or like each other&rsquo;s posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations, which are fixed values. For the detailed specifications of the benchmark and the Datagen component, see <a href="#references">References</a>.</p> +<p>Datasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).</p> +<p><img src="datagen_flow.png" alt=""> \ <em>Figure 1. LDBC SNB Datagen Process on Hadoop</em></p> +<p>In the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.</p> +<p><em>Note: The diagram shows the call sequence as implemented. 
All steps are sequential &ndash; including the relationship generation &ndash;, even in cases when the data dependencies would allow for parallelization.</em></p> +<p>Entities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. Most entities follow a shallow representation, i.e foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup> A notable exception is the Knows edge which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this <em>edge as property</em> representation makes the data harder to handle in SQL than it would be with a flat join table.</p> +<p>Entity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup></p> +<p>Serialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. 
Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.</p> +<h2 id="motivations-for-the-migration">Motivations for the migration</h2> +<p>The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:</p> +<ul> +<li> +<p><strong>Better memory utilization:</strong> MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.</p> +</li> +<li> +<p><strong>Smaller codebase:</strong> The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.</p> +</li> +<li> +<p><strong>Small entry cost:</strong> Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.</p> +</li> +<li> +<p><strong>Incremental improvements:</strong> Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. 
Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.</p> +</li> +<li> +<p><strong>OSS, commodity:</strong> Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.</p> +</li> +</ul> +<h2 id="first-steps">First steps</h2> +<p>The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. The following bullet-points summarize the key notions that cropped up during the process.</p> +<ul> +<li> +<p><strong>Use your memory:</strong> A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using <code>MEMORY_AND_DISK</code>). In short, the default caching strategy was used everywhere.</p> +</li> +<li> +<p><strong>Regression tests:</strong> Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. 
The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.</p> +</li> +<li> +<p><strong>Thread-safety concerns:</strong> Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn&rsquo;t a concern in the original implementation due to the fact that MapReduce doesn&rsquo;t use thread-based parallelization for mappers and reducers.<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, <sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup> which makes it somewhat harder to find potential unguarded shared variables.</p> +</li> +</ul> +<h2 id="case-study-person-ranking">Case study: Person ranking</h2> +<p>Migrating was rather straightforward, however, the so-called person ranking step required some thought. 
The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the <a href="#references">S3G2 paper [3]</a>.</p> +<h3 id="the-original-mapreduce-version">The original MapReduce version</h3> +<p><img src="person_ranking.svg" alt=""> \ <em>Figure 2. Diagram of the MapReduce code for ranking persons</em></p> +<p>The implementation, shown in pseudocode above, works as follows:</p> +<ol> +<li>The equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low.</li> +<li>The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate &ldquo;side-channel&rdquo; file upon the completion of a reduce task.</li> +<li>In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person.</li> +</ol> +<p>Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.</p> +<h3 id="the-migrated-version">The migrated version</h3> +<p>Spark provides a sortBy function which takes care of the first step above in a single line. 
The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.</p> +<h2 id="benchmarks">Benchmarks</h2> +<p>Benchmarks were carried out on AWS <a href="https://aws.amazon.com/emr/">EMR</a>, originally utilising <a href="https://aws.amazon.com/ec2/instance-types/i3/">i3.xlarge</a> instances because of their fast NVMe SSD storage and ample amount of RAM.</p> +<p>The application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yielded slowdowns for higher values. The Spark version on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.<sup id="fnref:5"><a href="#fn:5" class="footnote-ref" role="doc-noteref">5</a></sup> The MapReduce results were as follows:</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>16</td> +<td>1.60</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>34</td> +<td>1.13</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>40</td> +<td>1.20</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>44</td> +<td>1.32</td> +</tr> +</tbody> +</table> +<p>It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.</p> +<p><img src="mr_sf100_cpu_load.png" alt=""> <br> +<em>Figure 3. CPU Load for the Map Reduce cluster is bursty and less than<br> +50% on average (SF100, 2nd graph shows master)</em></p> +<p><img src="mr_sf100_mem_free.png" alt=""> <br> +<em>Figure 4. 
The job only starts to consume memory when already 10 minutes<br> +into the run (SF100, 2nd graph shows master)</em></p> +<p>Let&rsquo;s see how Spark fares.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>10</td> +<td>1.00</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>21</td> +<td>0.70</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>27</td> +<td>0.81</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>36</td> +<td>1.08</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +</tbody> +</table> +<p>A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn&rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. In fact, the OOM errors encountered when running SF3000 support this hypothesis even further. It was thus proposed to scale up the RAM in the instances. 
The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.</p> +<p><img src="spark_sf100_cpu_load.png" alt=""> <br> +<em>Figure 5. Full CPU utilization for Spark (SF100, last graph shows<br> +master)</em></p> +<p><img src="spark_sf100_mem_free.png" alt=""> <br> +<em>Figure 6. Spark eats up memory fast (SF100, 2nd graph shows master)</em></p> +<p>i3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it <em>only</em> has a 300 GB SSD.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>16</td> +<td>0.48</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>21</td> +<td>0.63</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>26</td> +<td>0.78</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +<tr> +<td>10000</td> +<td>303</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<p>The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.</p> +<h2 id="next-steps">Next steps</h2> +<p>The next improvement is refactoring the serializers so they use Spark&rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. 
EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.</p> +<p>As already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.</p> +<p>The Spark migration also serves as an important building block for the next generation of LDBC benchmarks. As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for <a href="#references">generating delete operations [1]</a>. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. 
Thanks for your help and feedback!</p> +<h2 id="references">References</h2> +<p>[1] <a href="https://ldbcouncil.org/docs/papers/datagen-deletes-grades-nda-2020.pdf">Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark&rsquo;s Data Generator</a></p> +<p>[2] <a href="https://www.youtube.com/watch?v=ZQOLuCOOpSI">9th TUC Meeting &ndash; LDBC SNB Datagen Update &ndash; Arnau Prat (UPC)</a> - <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">slides</a></p> +<p>[3] <a href="https://research.vu.nl/en/publications/s3g2-a-scalable-structure-correlated-social-graph-generator">S3G2: a Scalable Structure-correlated Social Graph Generator</a></p> +<p>[4] <a href="https://arxiv.org/abs/2001.02299">The LDBC Social Network Benchmark</a></p> +<p>[5] <a href="https://ldbcouncil.org/">LDBC</a> - <a href="https://github.com/ldbc">LDBC GitHub organization</a></p> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>Also makes it easier to map to a tabular format thus it is a SQL friendly representation.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>It&rsquo;s hard to imagine this done declaratively in SQL.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>Instead, multiple YARN containers have to be used if you want to parallelize on the same machine.&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>Although editors usually render these using different font styles.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:5"> +<p>With the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). 
During serialization, we check for such entities and omit them. However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.&#160;<a href="#fnref:5" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Twelfth TUC Meeting + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + Fri, 05 Jul 2019 08:30:00 +0100 + + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + <p>LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event on the last Friday of <strong><a href="https://sigmod2019.org/">SIGMOD/PODS 2019</a></strong> in Amsterdam, The Netherlands, in the conference venue of <strong><a href="http://sigmod2019.org/conf_venue">Beurs van Berlage</a></strong>. The room is the Mendes da Silva kamer. Please check its tips for <strong><a href="http://sigmod2019.org/accommodation">accommodation in Amsterdam</a></strong>.</p> +<p>Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2019">GRADES-NDA 2019</a>, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).</p> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.</p> +<p><strong>Talk proposals can be sent to Peter Boncz</strong>, who is also the local organizer. <strong>Please also send your slides to this email for archiving on this site.</strong></p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting, there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).</p> +<p>The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (<strong>room: Mendes da Silva kamer</strong>):</p> +<p>08:30-10:30 LDBC Board Meeting (non-public)</p> +<p>10:30-11:00 Coffee</p> +<p>11:00-12:45 Session 1: Graph Benchmarks</p> +<ul> +<li> +<p>11:00-11:05 Welcome &amp; introduction</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230404.pdf">11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706117.pdf">11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706118.pdf">12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706130.pdf">12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics</a></p> +</li> +</ul> +<p>12:45-14:00 Lunch</p> +<p>14:00-16:05 Session 2: Graph Query Languages</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706120.pdf">14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706121.pdf">14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706122.pdf">report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706119.pdf">14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706129.pdf">15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in GSQL, TigerGraph&rsquo;s query language</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230401.pdf">15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language</a></p> +</li> +</ul> +<p>16:05-16:30 Coffee</p> +<p>16:30-17:50 Session 3: Graph System Performance</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111968258.pdf">16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706124.pdf">16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity</a> <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706116.pptx">pptx</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706128.pdf">17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706133.pdf">17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data</a></p> +</li> +</ul> +<p>If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.</p> + + + + + Eleventh TUC Meeting + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + Fri, 08 Jun 2018 08:30:00 -0500 + + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + <p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event preceding the <a href="https://sigmod2018.org/">SIGMOD/PODS 2018</a> conference in Houston, Texas (not too far away, the whole next week). 
Note also that at SIGMOD/PODS in Houston on Sunday, June 10, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2018/">GRADES-NDA 2018</a> as well, so you might combine travel.</p> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (<a href="mailto:boncz@cwi.nl">boncz@cwi.nl</a>) and Larri (<a href="mailto:larri@ac.upc.edu">larri@ac.upc.edu</a>). The local organizer is Juan Sequeda (<a href="mailto:juanfederico@gmail.com">juanfederico@gmail.com</a>).</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00:</p> +<ul> +<li> +<p>10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090478.pdf">10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo</a></p> +</li> +<li> +<p>11:00-11:30 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090466.pdf">11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090463.pdf">11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090472.pdf">12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI</a></p> +</li> +<li> +<p>12:45-14:00 lunch</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090474.pdf">14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090481.pdf">14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090485.pdf">14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service</a></p> +</li> +<li> +<p>15:15-15:40 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99811329.pdf">15:40-16:05 Bryon Jacob (data.world) - Broadening the Semantic Web</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99287041.pdf">16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99745793.pdf">16:30-16:55 Arthur Keen (Cambridge Semantics) - AnzoGraph</a></p> +</li> +<li> +<p><a href="http://relational.ai/">16:55-17:20 Molham Aref (relational.ai)</a> - Introducing.. 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99418113.pdf">relational.ai</a></p> +</li> +<li> +<p>18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701</p> +</li> +</ul> +<h3 id="location">Location</h3> +<p>The TUC will be held at the <a href="https://www.cs.utexas.edu/">University of Texas at Austin, Department of Computer Science</a> in the <a href="https://www.google.com/maps/place/The+University+of+Texas:+Department+of+Computer+Science/@30.2860955,-97.737582,18z/data=!4m5!3m4!1s0x0:0x12edecc8226b3241!8m2!3d30.2862279!4d-97.7365348">Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712</a> Room: GDC 6.302</p> +<p>The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take the elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.</p> +<h3 id="from-austin-to-sigmodpods-houston-on-saturday-june-9">From Austin to SIGMOD/PODS (Houston) on Saturday June 9</h3> +<p>Many of the attendees will be going to SIGMOD/PODS, which will be held in Houston.</p> +<h4 id="bus">Bus</h4> +<p>One option is to take a <a href="https://us.megabus.com/journey-planner/journeys?days=1&amp;concessionCount=0&amp;departureDate=2018-06-09&amp;destinationId=318&amp;inboundOtherDisabilityCount=0&amp;inboundPcaCount=0&amp;inboundWheelchairSeated=0&amp;nusCount=0&amp;originId=320&amp;otherDisabilityCount=0&amp;pcaCount=0&amp;totalPassengers=1&amp;wheelchairSeated=0">MegaBus that departs from downtown Austin and arrives at downtown Houston</a>.</p> +<p>There is a bus that departs at 12:00pm and arrives at 3:00pm. Cost is $20 (as of April 23).</p> +<p>If you want to spend the day in Austin, there is a bus that departs at 9:55pm and arrives at 12:50am. 
Cost is $5 (as of April 23).</p> + + + + + Tenth TUC Meeting + https://ldbcouncil.org/event/tenth-tuc-meeting/ + Fri, 01 Sep 2017 10:30:00 +0100 + + https://ldbcouncil.org/event/tenth-tuc-meeting/ + <p>This will be a one-day event at the <a href="http://www.vldb.org/2017">VLDB 2017</a> conference in Munich, Germany on September 1, 2017.</p> +<p>Topics and activities of interest in these TUC meetings are:</p> +<ul> +<li>Presentation on graph data management usage scenarios.</li> +<li>Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Adrian Diaz (UPC) at <a href="mailto:adiaz@ac.upc.edu">adiaz@ac.upc.edu</a> to register; registration is free, but required.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00</p> +<p>10:30-12:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87588865.pdf">Peter Boncz (CWI): GraphQL task force update - the G-CORE proposal</a> (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868018.pptx">pptx</a>)</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868008.pdf">Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload</a></li> +<li>Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868014.pdf">LDBC Graphalytics v0.9</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868013.pdf">Graphalytics Global Competition and Graphalytics Custom Benchmark</a></li> +</ul> +<p>12:00-13:30: lunch break</p> +<p>13:30-15:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868024.pdf">Arnau Prat (UPC): Datasynth: Democratizing property graph 
generation</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868026.pdf">Marcus Paradies (SAP): SAP HANA GraphScript</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031809.pdf">Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform</a></li> +<li>Gaétan Hains (Huawei): Cost semantics for graph queries</li> +</ul> +<p>15:00-15:30: break</p> +<p>15:30-17:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031812.pdf">Petra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87195650.pdf">Markus Kaindl (Springer): SN SciGraph &ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain</a></li> +<li>Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks</li> +<li>Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset</li> +</ul> +<p>Speakers should aim for a <strong>20-minute talk</strong>.</p> +<p>Further:</p> +<ul> +<li>on Friday evening (19:00-21:00) there will be a <strong>social dinner</strong> at <a href="https://www.loewenbraeukeller.com/en/pub-and-beer-garden/">Löwenbräukeller</a>, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).</li> +<li>on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.</li> +</ul> +<h3 id="venue">Venue</h3> +<p>The Technical University of Munich (TUM) is hosting that week the <a href="http://www.vldb.org/2017">VLDB conference</a>; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops 
ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.</p> +<p>The TUC meeting will be held in <strong>Room 2607</strong> alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).</p> +<p><strong>address: Technische Universität München (TUM), Arcisstraße 21, 80333 München</strong></p> +<p><a href="https://www.google.nl/maps/place/Technische+Universit%C3%A4t+M%C3%BCnchen/@48.14966,11.5656715,17z/data=!3m1!4b1!4m5!3m4!1s0x479e7261336d8c11:0x79a04d44dc5bf19d!8m2!3d48.14966!4d11.5678602?hl=en">Google Maps</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920002.jpg" alt=""><br> +<img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920003.jpg" alt=""></p> + + + + + Ninth TUC Meeting + https://ldbcouncil.org/event/ninth-tuc-meeting/ + Thu, 09 Feb 2017 15:07:18 -0400 + + https://ldbcouncil.org/event/ninth-tuc-meeting/ + <p>LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> in Walldorf, Germany on February 9+10, 2017.</p> +<p>This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two-day event with one day devoted to users&rsquo; experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested, please contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts, and eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.</p> +<p>Thursday evening (19:00-21:00) there will be a <strong>social dinner</strong> in Heidelberg.</p> +<p>Friday morning the event resumes from 9:00-12:00. 
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.</p> +<h4 id="social-dinner">Social Dinner</h4> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235334.png" alt=""></p> +<p><strong>Address: Hauptstraße 217, 69117 Heidelberg</strong><br> +<strong>Time: 19:00 / 7pm</strong></p> +<p>(See attachments at the bottom of the page)</p> +<h5 id="thursday">Thursday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>Welcome and logistics - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>9:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235329.pdf">Intro + state of the LDBC - Josep Lluis Larriba Pey</a> (UPC)</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235338.pdf">LDBC Graph QL task force</a> - Hannes Voigt (TU Dresden)</td> +</tr> +<tr> +<td>9:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235335.pdf">PGQL Status Update and Comparison to LDBC&rsquo;s Graph QL proposals</a> - Oskar van Rest (Oracle Labs)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628546.pdf">Adding shortest-paths to MonetDB</a> - Dean de Leo (CWI)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431939.pdf">Evolving Cypher for processing multiple graphs</a> - Stefan Plantikow (Neo Technology)</td> +</tr> +<tr> +<td>11:10</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235346.pdf">Standardizing Graph Database Functionality - An Invitation to Collaborate</a> - Jan Michels (ISO/ANSI SQL, Oracle)&quot;</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235343.pdf">Dgraph: Graph database for production environment</a> - Tomasz Zdybal (Dgraph.io)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431945.pdf">LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap</a> - Alexandru Iosup (TU Delft)</td> +</tr> +<tr> +<td>13:20</td> +<td>LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)</td> +</tr> +<tr> +<td>13:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">LDBC SNB Datagen Update</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431943.pdf">LDBC SNB Business Intelligence Workload: Chokepoint Analysis</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431947.pdf">LDBC Benchmark Cost Specification</a> (+discussion) - Moritz Kaufmann (TU Munich)</td> +</tr> +<tr> +<td>14:40</td> +<td>coffee break</td> +</tr> +<tr> +<td>15:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76316673.pdf">EYWA: the Distributed Graph Engine in Huawei MIND Platform</a> (Yinglong Xia)</td> +</tr> +<tr> +<td>15:30</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431949.pdf">Graph Processing in SAP HANA</a> - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>15:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628563.pdf">Distributed Graph Analytics with Gradoop</a> - Martin Junghanns (Univ Leipzig)</td> +</tr> +<tr> +<td>16:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152834.pdf">Distributed graph flows: Cypher on Flink and Gradoop</a> - Max Kießling (Neo Technology)</td> +</tr> +<tr> +<td>16:30</td> +<td>closing - Peter Boncz</td> +</tr> +<tr> +<td>17:30</td> +<td>end</td> +</tr> +</tbody> +</table> +<h5 id="friday">Friday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>welcome - Peter Boncz</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152833.pdf">Graph processing in obi4wan</a> - Frank Smit (OBI4WAN)</td> +</tr> +<tr> +<td>9:40</td> +<td>Graph problems in the space domain - Albrecht Schmidt (ESA)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75792387.pdf">Medical Ontologies for Healthcare</a> - Michael Neumann (SAP)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76447745.pdf">The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries</a> - Gabor Szarnyas (BME)</td> +</tr> +<tr> +<td>11:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76021761.pdf">Efficient sparse 
matrix computations and their generalization to graph computing applications</a> - Albert-Jan Yzelman (Huawei)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152837.pdf">Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge</a> - Atanas Kyriakov (Ontotext)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td>LDBC Board of Directors Meeting</td> +</tr> +<tr> +<td>17:00</td> +<td>end</td> +</tr> +</tbody> +</table> +<h3 id="logistics">Logistics</h3> +<h5 id="important-things-to-know"><strong>Important things to know</strong></h5> +<p>The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">link</a></p> +<h5 id="venue"><strong>Venue</strong></h5> +<p>The TUC meeting will be held in the <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> at the SAP Guesthouse Kalipeh (<a href="https://www.kalipeh.com">https://www.kalipeh.com</a>). The address is:</p> +<p><strong>WDF 44 / SAP Guesthouse Kalipeh<br> +Dietmar-Hopp-Allee 15<br> +69190 Walldorf<br> +Germany</strong></p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p><a href="https://www.google.com/maps/place/SAP+Guesthouse+Kalipeh/@49.2951903,8.6436224,17z/data=!3m1!4b1!4m5!3m4!1s0x4797bea343a566af:0xd70698f3503ab74b!8m2!3d49.2951868!4d8.6458111">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/69042180.png" alt=""></p> +<h4 id="getting-there"><strong>Getting there</strong></h4> +<h5 id="by-plane"><strong>By plane</strong></h5> +<p>There are two airports close to SAP&rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). 
The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.</p> +<p>When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt-Hahn is approximately one hour from the Frankfurt main airport by car.</p> +<p>The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).</p> +<p>The journey from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).</p> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<p><strong>Traveling from Frankfurt Airport (FRA) to SAP Headquarters:</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.&rdquo;</li> +<li>Follow the A5 to &ldquo;Basel/Karlsruhe/Heidelberg.&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<p>(Should you use a navigational system which does not recognize the street name &lsquo;Dietmar-Hopp-Allee&rsquo; please use &lsquo;Neurottstrasse&rsquo; instead.)</p> +<p><strong>Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:</strong></p> +<p>To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. 
The route via Karlsruhe is a bit shorter yet may be more congested.</p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A8/Stuttgart/B27.&rdquo;</li> +<li>Stay on A8 and follow the sign for &ldquo;Karlsruhe/Heilbronn/Singen/A8.&rdquo;</li> +<li>Follow A8 to Karlsruhe.</li> +<li>Take exit 41 &ndash; &ldquo;Dreieck Karlsruhe&rdquo; to merge onto A5 toward &ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<h6 id="parking"><strong>Parking</strong></h6> +<p>The closest parking lot to the event location is P7 (see figure above).</p> +<h5 id="by-train"><strong>By Train</strong></h5> +<p>As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.</p> +<p><strong>From Frankfurt Airport (FRA) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to Terminal 1, level T (see overview in Appendix).</li> +<li>Go to the AIRail Terminal &ndash; &ldquo;Fernbahnhof&rdquo; (long-distance trains).</li> +<li>Choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP.&rdquo; It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> +<p><strong>From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to the S-Bahn station in the airport, following the sign (station is called &ldquo;Stuttgart Flughafen/Messe&rdquo;).</li> +<li>Take 
train number S2 or S3 to &ldquo;Stuttgart Hauptbahnhof&rdquo; (main station).</li> +<li>From Stuttgart Hauptbahnhof choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP&rdquo;. It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> + + + + + LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + Tue, 06 Sep 2016 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + <p>LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.</p> +<p>LDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.</p> +<p>Tim Hegeman of <a href="https://www.tudelft.nl">TU Delft</a> is today presenting the technical paper describing LDBC Graphalytics at the important <a href="https://www.vldb.org/conference.html">VLDB</a> (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.</p> +<p>LDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.</p> +<p>Learn more: <a href="/ldbc-graphalytics">LDBC Graphalytics</a></p> +<p>GitHub: <a href="https://github.com/tudelft-atlarge/graphalytics">https://github.com/tudelft-atlarge/graphalytics</a></p> + + + + + Eighth TUC Meeting + https://ldbcouncil.org/event/eighth-tuc-meeting/ + Wed, 22 Jun 2016 14:45:20 -0400 + + https://ldbcouncil.org/event/eighth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a> in Redwood Shores facility on <strong>Wednesday and Thursday June 22-23, 2016</strong>.</p> +<p>This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify Oracle security in advance, registration requests need to be in by <strong>June 12</strong>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. 
We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +<li><a href="#accommodation">Accommodation</a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1pm.</p> +<h6 id="wednesday-22th-of-june-2016-room-203"><strong>Wednesday, 22nd of June 2016 (Room 203)</strong></h6> +<p>(full morning: LDBC Board of Directors meeting)</p> +<ul> +<li>12:00 - 13:00 Lunch (provided)</li> +<li>13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome.</li> +<li>13:30 - 14:00 Peter Boncz (CWI) <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133891.pdf">LDBC introduction and status update</a>.</li> +<li>14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. 
Larriba-Pey)</li> +<li>14:00 Arnau Prat (DAMA-UPC). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133902.pdf">Social Network Benchmark, Interactive workload</a>.</li> +<li>14:30 Tim Hegeman (TU Delft). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133893.pdf">Social Network Benchmark, Analytics workload</a>.</li> +<li>15:00 - 15:30 Coffee break</li> +<li>15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133897.pdf">Graphing Healthcare Networks: Data, Analytics, and Use Cases.</a></li> +<li>16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133901.pdf">Frappé: Querying and managing evolving code dependency graphs</a>.</li> +<li>16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133895.pdf">UniProt: challenges of a public SPARQL endpoint.</a></li> +</ul> +</li> +<li>17:00 - 18:30 Graph Technologies (chair Peter Boncz) +<ul> +<li>17:00 Eugene I. Chong (Oracle USA). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133904.pdf">Balancing Act to improve RDF Query Performance in Oracle Database</a>.</li> +<li>17:30 Lijun Chang (University of New South Wales). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133906.pdf">Efficient Subgraph Matching by Postponing Cartesian Products</a>.</li> +<li>18:00 Weining Qian (East China Normal University). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133908.pdf">On Statistical Characteristics of Real-Life Knowledge Graphs</a>.</li> +</ul> +</li> +</ul> +<h6 id="thursday-23th-of-june-2016-room-203"><strong>Thursday, 23th of June 2016 (Room 203)</strong></h6> +<ul> +<li>08:00 - 09:00 Breakfast (provided)</li> +<li>09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) +<ul> +<li>09:00 Peter Boncz (CWI). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133896.pdf">Query Language Task Force status</a></li> +<li>09:45 Marcus Paradies (SAP). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297729.pdf">Social Network Benchmark, Business Intelligence workload</a></li> +</ul> +</li> +<li>10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) +<ul> +<li>10:00 Sergey Edunov (Facebook). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297731.pdf">Generating realistic trillion-edge graphs</a></li> +<li>10:30 George Fletcher (TU Eindhoven). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297733.pdf">An open source framework for schema-driven graph instance and graph query workload generation</a>.</li> +<li>11:00 Yinglong Xia (Huawei Research America): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297735.pdf">An Efficient Big Graph Analytics Platform</a>.</li> +<li>11:30 Zhe Wu (Oracle USA). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297737.pdf">Bridging RDF Graph and Property Graph Data Models</a></li> +</ul> +</li> +<li>12:00 - 13:30 Lunch (provided)</li> +<li>13:30 - 15:30 Graph Technologies (chair Arnau Prat) +<ul> +<li>13:30 Tobias Lindaaker (Neo Technology). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297740.pdf">An open standard for graph queries: the Cypher contribution</a></li> +<li>14:00 Arash Termehchy (Oregon State University). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297742.pdf">Toward Representation Independent Graph Querying &amp; Analytics</a></li> +<li>14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297745.pdf">In the service of the federation</a></li> +<li>15:00 Nandish Jayaram (Pivotal). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297747.pdf">Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs</a>.</li> +</ul> +</li> +<li>15:30 - 16:00 Coffee break</li> +<li>16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>16:00 Jans Aasman (Franz Inc.). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428806.pdf">Semantic Data Lake for Healthcare</a></li> +<li>16:15 Kevin Madden (Tom Sawyer Software). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428812.pdf">Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis</a></li> +<li>16:45 Juan Sequeda (Capsenta). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428810.pdf">Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources</a></li> +<li>17:15 Kevin Wilkinson (Hewlett Packard Labs). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428808.pdf">LDBC SNB extensions</a></li> +</ul> +</li> +<li>17:45 - 18:15 Closing discussion</li> +</ul> +<h6 id="friday-24th-of-june-2016-room-105"><strong>Friday, 24th of June 2016 (Room 105)</strong></h6> +<p>At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (<strong>GRADES16</strong>).</p> +<p>18:30 social dinner for GRADES registrants (place to be announced)</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>22nd and 23rd June 2016</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a></p> +<p>The address is:</p> +<p><strong>Room 203 (Wed-Thu) &amp; Room 105 (Fri)</strong><br> +<strong>Oracle Conference Center</strong><br> +<strong>350 Oracle Parkway</strong><br> +<strong>Redwood City, CA 94065, USA</strong></p> +<p><strong>Maps and situation</strong></p> +<p><a href="https://www.google.com/maps/place/Oracle+Conference+Center/@37.5322827,-122.2667034,17z/data=!3m1!4b1!4m2!3m1!1s0x808f98b5450e8ca3:0xdc75e8b1c02bbb91">Google Maps link</a></p> +<p>Oracle Campus map:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/40927234.jpg" alt=""></p> +<h5 id="getting-there"><strong>Getting there</strong></h5> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<ul> +<li>[Southbound] <strong>-</strong> Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine 
World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +<li>[Northbound] <strong>-</strong> Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +</ul> +<h5 id="parking"><strong>Parking</strong></h5> +<p>The Conference Center has a designated parking lot located directly across from the building. If the lot is full, there is also additional parking in any of the parking garages located nearby. No parking permits are needed.</p> +<h5 id="public-transport"><strong>Public transport</strong></h5> +<p>Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.</p> +<ul> +<li>Caltrain timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/weekdaytimetable.html</a></li> +<li>Oracle Shuttle timetables: <a href="http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html">http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html</a></li> +</ul> +<p>You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.</p> +<p>Alternatively, SamTrans (San Mateo County&rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.</p> + + + + + LDBC and Apache Flink + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + Mon, 16 Nov 2015 14:47:00 +0000 + + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + <p>Apache Flink <a href="#references">[1]</a> is an open source platform for 
distributed stream and batch data processing. Flink&rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.</p> +<p><img src="https://flink.apache.org/img/flink-stack-small.png" alt=""></p> +<p>Flink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.</p> +<p>The following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">&gt;</span> text <span style="color:#f92672">=</span> env<span style="color:#f92672">.</span><span style="color:#a6e22e">fromElements</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the past controls the future.&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the present controls the past.&#34;</span><span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> 
+</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>Tuple2<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">,</span> Integer<span style="color:#f92672">&gt;&gt;</span> wordCounts <span style="color:#f92672">=</span> text +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">flatMap</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> LineSplitter<span style="color:#f92672">())</span> <span style="color:#75715e">// splits the line and outputs (word,1) +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>tuples<span style="color:#f92672">.</span><span style="color:#a6e22e">groupBy</span><span style="color:#f92672">(</span><span style="color:#ae81ff">0</span><span style="color:#f92672">)</span> <span style="color:#75715e">// group by word +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">.</span><span style="color:#a6e22e">sum</span><span style="color:#f92672">(</span><span style="color:#ae81ff">1</span><span style="color:#f92672">);</span> <span style="color:#75715e">// sum the 1&#39;s +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>wordCounts<span style="color:#f92672">.</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop <a href="#references">[2]</a>. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. 
To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing <a href="#references">[3]</a>. Using the class <code>LDBCToFlink</code>, LDBC output files can be read directly from HDFS or from the local file system:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;hdfs:///ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> <span style="color:#75715e">// or &#34;/path/to/social_network&#34; +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> vertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span 
style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> edges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The tuple classes <code>LDBCVertex</code> and <code>LDBCEdge</code> hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.</p> +<p>Each <code>LDBCVertex</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all vertices</li> +<li>a vertex label (e.g. <code>Person</code>, <code>Comment</code>)</li> +<li>a key-value map of properties including also multivalued properties<br> +(e.g. <code>Person.email</code>)</li> +</ul> +<p>Each <code>LDBCEdge</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all edges</li> +<li>an edge label (e.g. <code>knows</code>, <code>likes</code>)</li> +<li>a source vertex identifier</li> +<li>a target vertex identifier</li> +<li>a key-value map of properties</li> +</ul> +<p>The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label <code>Person</code> and edges with the label <code>knows</code> and use Gelly to compute the connected components of that subgraph. 
The full source code is available on GitHub <a href="#references">[4]</a>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter vertices with label “Person” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> ldbcVertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span 
style="color:#f92672">.</span><span style="color:#a6e22e">VERTEX_CLASS_PERSON</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter edges with label “knows” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> ldbcEdges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span style="color:#f92672">.</span><span style="color:#a6e22e">EDGE_CLASS_KNOWS</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly vertices suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> vertices <span style="color:#f92672">=</span> ldbcVertices<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly edges suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span 
style="color:#f92672">&lt;</span>Edge<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;&gt;</span> edges <span style="color:#f92672">=</span> ldbcEdges<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>Graph<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;</span> g <span style="color:#f92672">=</span> Graph<span style="color:#f92672">.</span><span style="color:#a6e22e">fromDataSet</span><span style="color:#f92672">(</span>vertices<span style="color:#f92672">,</span> edges<span style="color:#f92672">,</span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// run connected components on the subgraph for 10 iterations +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> components <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> g<span style="color:#f92672">.</span><span style="color:#a6e22e">run</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> ConnectedComponents<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;(</span><span 
style="color:#ae81ff">10</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// print the component id of the first 10 vertices +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>components<span style="color:#f92672">.</span><span style="color:#a6e22e">first</span><span style="color:#f92672">(</span><span style="color:#ae81ff">10</span><span style="color:#f92672">).</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The ldbc-flink-import tool is available on Github <a href="#references">[3]</a> and licensed under the GNU GPLv3. If you have any questions regarding the tool please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.</p> +<p>If you want to learn more about Apache Flink, a good starting point is the main documentation <a href="#references">[5]</a> and if you have any question feel free to ask the official mailing lists.<br> +There is also a nice set of videos <a href="#references">[6]</a> available from the latest Flink Forward conference.</p> +<h4 id="references">References</h4> +<p>[1] <a href="http://flink.apache.org/">http://flink.apache.org/</a></p> +<p>[2] <a href="https://github.com/dbs-leipzig/gradoop">https://github.com/dbs-leipzig/gradoop</a></p> +<p>[3] <a href="https://github.com/s1ck/ldbc-flink-import">https://github.com/s1ck/ldbc-flink-import</a></p> +<p>[4] <a href="https://gist.github.com/s1ck/b33e6a4874c15c35cd16">https://gist.github.com/s1ck/b33e6a4874c15c35cd16</a></p> +<p>[5] <a href="https://ci.apache.org/projects/flink/flink-docs-release-0.10/">https://ci.apache.org/projects/flink/flink-docs-release-0.10/</a></p> +<p>[6] <a 
href="https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA">https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA</a></p> + + + + + Seventh TUC Meeting + https://ldbcouncil.org/event/seventh-tuc-meeting/ + Mon, 09 Nov 2015 14:17:30 -0400 + + https://ldbcouncil.org/event/seventh-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.research.ibm.com/labs/watson">IBM&rsquo;s TJ Watson</a> facility on <strong>Monday and Tuesday November 9/10, 2015.</strong></p> +<p>This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify IBM security in advance, registration requests need to be in by Nov 1.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a><br> +- <a href="#date"><strong>Date</strong></a><br> +- <a href="#venue"><strong>Venue</strong></a><br> +- <a href="#maps-and-situation"><strong>Maps and situation</strong></a><br> +- <a href="#getting-there"><strong>Getting there</strong></a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>Monday, 9th of November 2015</strong></p> +<p>8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)</p> +<p>9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)</p> +<p>9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)</p> +<p>9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload</p> +<p>10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload</p> +<p>10:30-11:00 Coffee break</p> +<p>11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)</p> +<p>11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.</p> +<p>11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.</p> +<p>12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)</p> +<p>14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox</p> +<p>14:30 Peter Kogge (Notre Dame). 
BFS as in Graph500 on today&rsquo;s architectures</p> +<p>15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G</p> +<p>15:30-16:00 Coffee break</p> +<p>16:00 - 17:00 Technologies (chair Irini Fundulaki)</p> +<p>16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store</p> +<p>16:30 David Ediger (GeorgiaTech). STINGER</p> +<p>17:00 Gary King (Franz Inc.). AllegroGraph&rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties</p> +<p>17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics</p> +<p>18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase</p> +<p>19:00 Social dinner</p> +<p><strong>Tuesday 10th November 2015</strong></p> +<p>9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)</p> +<p>9:00 Philip Rathle (Neo). On openCypher</p> +<p>9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification</p> +<p>9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions</p> +<p>10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation</p> +<p>10:30 - 11:00 Coffee break</p> +<p>11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)</p> +<p>11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL</p> +<p>11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,</p> +<p>11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis</p> +<p>12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>9th and 10th November 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the IBM Thomas J Watson Research Center.<br> +The address is:</p> +<p><strong>IBM Thomas J Watson Research Center</strong><br> +<strong>1101 Kitchawan Rd,</strong><br> +<strong>Yorktown Heights, NY 10598, USA</strong></p> +<p>If you are using a <em>GPS system</em>, please enter <strong>&ldquo;200 Aqueduct Road, Ossining NY, 10562&rdquo;</strong> for accurate directions to the lab entrance. You may also want to check the routing online.</p> +<p>The meeting will take place in the <em>Auditorium</em> on November 9th, and in Meeting Room <em>20-043</em> on November 10th.</p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p>We strongly suggest that you <strong>rent a car</strong> for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walking distance of the IBM T.J. Watson Research Center. Feel free to arrange a carpool with other attendees. You may find car rental and hotels through <a href="http://www.orbitz.com">www.orbitz.com</a>, or <a href="http://www.expedia.com">www.expedia.com</a>. Feel free to email <a href="mailto:yxia@us.ibm.com">yxia@us.ibm.com</a> with any questions.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/seventh-tuc-meeting/attachments/6882333/15926330.png" alt=""></p> +<h6 id="getting-there"><strong>Getting there</strong></h6> +<p><strong>Upper and Eastern New England</strong></p> +<p>Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. 
IBM is on the left.</p> +<p><strong>New Haven and Connecticut Shores</strong></p> +<p>Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New Jersey</strong></p> +<p>Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>Upstate New York</strong></p> +<p>Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New York City (Manhattan)</strong></p> +<p>Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>John F. Kennedy International Airport</strong></p> +<p>North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>LaGuardia Airport</strong></p> +<p>East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. 
Continue with instructions from John F. Kennedy International Airport, above.</p> +<p><strong>Newark International Airport</strong></p> +<p>North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.</p> +<p><strong>Stewart International Airport</strong></p> +<p>Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.</p> +<p><strong>Westchester County Airport</strong></p> +<p>Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.</p> +<p><strong>Public Transportation</strong></p> +<p>Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.</p> + + + + + Elements of Instance Matching Benchmarks: a Short Overview + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + Tue, 16 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + <p>The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. 
In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using <strong>instance matching</strong> techniques and tools. Instance matching is also known as <strong>record linkage</strong> <a href="#references">[1]</a>, <strong>duplicate detection</strong> <a href="#references">[2]</a>, <strong>entity resolution</strong> <a href="#references">[3]</a> and <strong>object identification</strong> <a href="#references">[4]</a>.</p> +<p>For instance, a search in Geonames (<a href="http://www.geonames.org/">http://www.geonames.org/</a>) for &ldquo;Athens&rdquo; would return a resource (i.e., URI) accompanied by a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as for instance DBpedia (<a href="http://dbpedia.org/">http://dbpedia.org/</a>) or Open Government Datasets (<a href="http://data.gov.gr/">http://data.gov.gr/</a>). To obtain all necessary information about the city of Athens, we need to establish that the retrieved resources refer to the same real world object.</p> +<p>Web resources are published by &ldquo;autonomous agents&rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data. 
Clearly, these reasons are not limited to problems that did arise in the era of Web Data; it is thus not surprising that instance matching systems have been around for several years <a href="#references">[2]</a><a href="#references">[5]</a>.</p> +<p>It is, though, essential at this point to develop, along with instance and entity matching systems, <em>instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs</em>. Hence, well-defined, good-quality benchmarks are important for comparing the performance of the available or under development instance matching systems. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.</p> +<p>An instance matching benchmark for Linked Data consists of a <em>source</em> and <em>target dataset</em> implementing a set of <em>test-cases</em>, where each test case addresses a different kind of requirement regarding instance matching, a <em>ground truth</em> or <em>gold standard</em> and finally the <em>evaluation metrics</em> used to <em>assess the benchmark.</em></p> +<p>Datasets are the raw material of a benchmark. A benchmark comprises a <em>source</em> and <em>target</em> dataset and the objective of an instance matching system is to discover the matches of the two. 
Datasets are characterized by (a) their <em>nature</em> (<em>real</em> or <em>synthetic</em>), (b) the <em>schemas/ontologies</em> they use, (c) their <em>domains</em>, (d) the <em>languages</em> they are written in, and (e) the <em>variations/heterogeneities</em> of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. <em>Synthetic datasets</em> are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner.</p> +<p>Datasets (and benchmarks) may contain different <em>kinds of variations</em> that correspond to <em>different test cases</em>. According to Ferrara et al. <a href="#references">[6]</a><a href="#references">[7]</a>, three kinds of variations exist for Linked Data, namely <em>data variations</em>, <em>structural variations</em> and <em>logical variations</em>. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language, etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.</p> +<p>The common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations. 
On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.</p> +<p>The <em>gold standard</em> is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.</p> +<p>Last, an instance matching benchmark uses <em>evaluation metrics</em> to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard <em>precision</em>, <em>recall</em> and <em>f-measure</em> metrics.</p> +<h4 id="references">References</h4> +<p>[1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. 
WWW 2006.</p> +<p>[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamazal, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).</p> +<p>[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.</p> +<p>[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.</p> +<p>[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.</p> +<p>[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3rd ISWC workshop on ontology matching (OM 2008).</p> +<p>[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + Wed, 10 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + <p>In this post we will look at running the <a href="https://ldbcouncil.org/developer/snb">LDBC SNB</a> on <a href="https://virtuoso.openlinksw.com/">Virtuoso</a>.</p> +<p>First, let&rsquo;s recap what the benchmark is about:</p> +<ol> +<li> +<p>fairly frequent short updates, with no update contention worth mentioning</p> +</li> +<li> +<p>short random lookups</p> +</li> +<li> +<p>medium complex queries centered around a person&rsquo;s social environment</p> +</li> +</ol> +<p>The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.</p> +<p>The DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, <em>per se,</em> since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.</p> +<p>The workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.</p> +<p>The test system is the same as used in the <a href="http://www.openlinksw.com/weblog/oerling/?id=1739">TPC-H series</a>: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics branch</a> of <a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack, available from www.github.com</a>.</p> +<p>The dataset is the SNB 300G set, with:</p> +<table> +<thead> +<tr> +<th>1,136,127</th> +<th>persons</th> +</tr> +</thead> +<tbody> +<tr> +<td>125,249,604</td> +<td>knows edges</td> +</tr> +<tr> +<td>847,886,644</td> +<td>posts, including replies</td> +</tr> +<tr> +<td>1,145,893,841</td> +<td>tags of posts or replies</td> +</tr> +<tr> +<td>1,140,226,235</td> +<td>likes of posts or replies</td> +</tr> +</tbody> +</table> +<p>As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.</p> +<p>Below are the numerical quantities for a 400K operation run after 150K operations worth of warmup.</p> +<p><strong>Duration:</strong> 10:41.251<br> +<strong>Throughput:</strong> 623.71 (op/s)</p> +<p>The statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.</p> +<table> +<thead> +<tr> +<th>% of total</th> +<th>total_wait</th> +<th>name</th> +<th>count</th> +<th>mean</th> +<th>min</th> +<th>max</th> +</tr> +</thead> +<tbody> +<tr> +<td>20%</td> +<td>4,231,130</td> +<td>LdbcQuery5</td> +<td>656</td> +<td>6,449.89</td> +<td>245</td> +<td>10,311</td> +</tr> +<tr> +<td>11%</td> +<td>2,272,954</td> +<td>LdbcQuery8</td> +<td>18,354</td> +<td>123.84</td> +<td>14</td> +<td>2,240</td> +</tr> +<tr> +<td>10%</td> +<td>2,200,718</td> +<td>LdbcQuery3</td> +<td>388</td> +<td>5,671.95</td> +<td>468</td> +<td>17,368</td> +</tr> +<tr> +<td>7.3%</td> +<td>1,561,382</td> +<td>LdbcQuery14</td> +<td>1,124</td> +<td>1,389.13</td> +<td>4</td> +<td>5,724</td> +</tr> +<tr> +<td>6.7%</td> +<td>1,441,575</td> +<td>LdbcQuery12</td> +<td>1,252</td> +<td>1,151.42</td> +<td>15</td> +<td>3,273</td> +</tr> +<tr> +<td>6.5%</td> +<td>1,396,932</td> +<td>LdbcQuery10</td> +<td>1,252</td> +<td>1,115.76</td> +<td>13</td> +<td>4,743</td> +</tr> +<tr> +<td>5%</td> +<td>1,064,457</td> +<td>LdbcShortQuery3PersonFriends</td> +<td>46,285</td> +<td>22.9979</td> +<td>0</td> +<td>2,287</td> +</tr> +<tr> +<td>4.9%</td> +<td>1,047,536</td> +<td>LdbcShortQuery2PersonPosts</td> +<td>46,285</td> +<td>22.6323</td> +<td>0</td> +<td>2,156</td> +</tr> +<tr> +<td>4.1%</td> +<td>885,102</td> +<td>LdbcQuery6</td> +<td>1,721</td> +<td>514.295</td> +<td>8</td> +<td>5,227</td> +</tr> +<tr> +<td>3.3%</td> +<td>707,901</td> +<td>LdbcQuery1</td> +<td>2,117</td> +<td>334.389</td> +<td>28</td> +<td>3,467</td> +</tr> +<tr> +<td>2.4%</td> +<td>521,738</td> +<td>LdbcQuery4</td> +<td>1,530</td> +<td>341.005</td> +<td>49</td> +<td>2,774</td> +</tr> +<tr> +<td>2.1%</td> +<td>440,197</td> +<td>LdbcShortQuery4MessageContent</td> +<td>46,302</td> +<td>9.50708</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.9%</td> +<td>407,450</td> +<td>LdbcUpdate5AddForumMembership</td> +<td>14,338</td> +<td>28.4175</td> +<td>0</td> +<td>2,008</td> +</tr> +<tr> 
+<td>1.9%</td> +<td>405,243</td> +<td>LdbcShortQuery7MessageReplies</td> +<td>46,302</td> +<td>8.75217</td> +<td>0</td> +<td>2,112</td> +</tr> +<tr> +<td>1.9%</td> +<td>404,002</td> +<td>LdbcShortQuery6MessageForum</td> +<td>46,302</td> +<td>8.72537</td> +<td>0</td> +<td>1,968</td> +</tr> +<tr> +<td>1.8%</td> +<td>387,044</td> +<td>LdbcUpdate3AddCommentLike</td> +<td>12,659</td> +<td>30.5746</td> +<td>0</td> +<td>2,060</td> +</tr> +<tr> +<td>1.7%</td> +<td>361,290</td> +<td>LdbcShortQuery1PersonProfile</td> +<td>46,285</td> +<td>7.80577</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.6%</td> +<td>334,409</td> +<td>LdbcShortQuery5MessageCreator</td> +<td>46,302</td> +<td>7.22234</td> +<td>0</td> +<td>2,055</td> +</tr> +<tr> +<td>1%</td> +<td>220,740</td> +<td>LdbcQuery2</td> +<td>1,488</td> +<td>148.347</td> +<td>2</td> +<td>2,504</td> +</tr> +<tr> +<td>0.96%</td> +<td>205,910</td> +<td>LdbcQuery7</td> +<td>1,721</td> +<td>119.646</td> +<td>11</td> +<td>2,295</td> +</tr> +<tr> +<td>0.93%</td> +<td>198,971</td> +<td>LdbcUpdate2AddPostLike</td> +<td>5,974</td> +<td>33.3062</td> +<td>0</td> +<td>1,987</td> +</tr> +<tr> +<td>0.88%</td> +<td>189,871</td> +<td>LdbcQuery11</td> +<td>2,294</td> +<td>82.7685</td> +<td>4</td> +<td>2,219</td> +</tr> +<tr> +<td>0.85%</td> +<td>182,964</td> +<td>LdbcQuery13</td> +<td>2,898</td> +<td>63.1346</td> +<td>1</td> +<td>2,201</td> +</tr> +<tr> +<td>0.74%</td> +<td>158,188</td> +<td>LdbcQuery9</td> +<td>78</td> +<td>2,028.05</td> +<td>1,108</td> +<td>4,183</td> +</tr> +<tr> +<td>0.67%</td> +<td>143,457</td> +<td>LdbcUpdate7AddComment</td> +<td>3,986</td> +<td>35.9902</td> +<td>1</td> +<td>1,912</td> +</tr> +<tr> +<td>0.26%</td> +<td>54,947</td> +<td>LdbcUpdate8AddFriendship</td> +<td>571</td> +<td>96.2294</td> +<td>1</td> +<td>988</td> +</tr> +<tr> +<td>0.2%</td> +<td>43,451</td> +<td>LdbcUpdate6AddPost</td> +<td>1,386</td> +<td>31.3499</td> +<td>1</td> +<td>2,060</td> +</tr> +<tr> +<td>0.01%</td> +<td>1,848</td> 
+<td>LdbcUpdate4AddForum</td> +<td>103</td> +<td>17.9417</td> +<td>1</td> +<td>65</td> +</tr> +<tr> +<td>0.00%</td> +<td>44</td> +<td>LdbcUpdate1AddPerson</td> +<td>2</td> +<td>22</td> +<td>10</td> +<td>34</td> +</tr> +</tbody> +</table> +<p>At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.</p> +<p>The implementation is well optimized in general but still has maybe 30% room for improvement. We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.</p> +<p>The set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:</p> +<ul> +<li> +<p><em>Cardinality estimation under heavy data skew —</em> Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.</p> +</li> +<li> +<p><em>Covering indices —</em> Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. 
For example, an index on post by author can also contain the post&rsquo;s creation date.</p> +</li> +<li> +<p><em>Multi-hop graph traversal —</em> Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.</p> +</li> +<li> +<p><em>Top <em>K</em> —</em> Most queries returning posts order results by descending date. Once there are at least <em>k</em> results, anything older than the <em>k</em>th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top <em>k</em>.</p> +</li> +<li> +<p><em>Late projection —</em> Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.</p> +</li> +<li> +<p><em>Materialization —</em> Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.</p> +</li> +<li> +<p><em>Concurrency control —</em> Read-write contention is rare, as updates are randomly spread over the database. 
However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.</p> +</li> +</ul> +<p>In subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + SNB and Graphs Related Presentations at GRADES '15 + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + Fri, 29 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + <p>Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. 
GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.</p> +<p>Among the papers published in this edition we have &ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms&rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in <a href="https://github.com/ldbc">https://github.com/ldbc</a>) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have &ldquo;Microblogging Queries on Graph Databases: an Introspection&rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention &ldquo;Frappé: Querying the Linux Kernel Dependency Graph&rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.</p> +<p><a href="http://event.cwi.nl/grades2015/program.shtml">Check the complete agenda.</a></p> +<p>Meet you in Melbourne!</p> + + + + + SNB Interactive Part 2: Modeling Choices + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + Tue, 26 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + <p><a href="https://ldbcouncil.org/benchmarks/snb">​SNB Interactive</a> is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.</p> +<p>In the case of <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server">Virtuoso</a>, we have played with <a href="http://dbpedia.org/resource/SQL">SQL</a> and <a href="http://dbpedia.org/resource/SPARQL">SPARQL</a> implementations. For a fixed schema and well known workload, SQL will always win. 
The reason for this is that this allows one to materialize multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.</p> +<h3 id="schema-design">Schema Design</h3> +<p>SNB has a regular schema described by a <a href="https://en.wikipedia.org/wiki/Unified_Modeling_Language">UML</a> diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.</p> +<p>The only table-level choice has to do with whether <code>posts</code> and <code>comments</code> are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. There is a single nullable foreign key from the reply to the post/comment being replied to.</p> +<p>The workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. 
There, having a composite key of <code>ps_creatorid</code>, <code>ps_creationdate</code>, <code>ps_postid</code> pays off since the top-k on <code>creationdate</code> can be pushed down into the index without needing a reference to the table.</p> +<p>The implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the <a href="http://dbpedia.org/resource/DEX_(Graph_database)">Sparksee</a> and <a href="http://dbpedia.org/resource/Neo4j">Neo4J</a> implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.</p> +<p>The benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 &ldquo;top of the wall&rdquo; to a single lookup. This does not, however, appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. <code>person1</code>, <code>person2</code> -&gt; <code>weight</code>. 
<code>Person1</code> is by convention the one with the smaller <code>p_personid</code>. Note that comparing id&rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI&rsquo;s with disastrous performance implications unless an implementation specific trick were used.</p> +<p>In the next installment we will analyze an actual run.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + Mon, 25 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + <p>LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.</p> +<p>On the industry track, LDBC will be presenting the <em>Social Network Benchmark Interactive Workload</em> by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).</p> +<p>You can read more about the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark here</a> and collaborate if you&rsquo;re interested!</p> +<p>The other presentation will be at the GRADES workshop within the SIGMOD program regarding <em>Graphalytics: A Big Data Benchmark for Graph-Processing platforms</em> by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.</p> +<p>Don&rsquo;t forget to check our presentations if you&rsquo;re attending the SIGMOD!</p> + + + + + SNB Interactive Part 1: What Is SNB Interactive Really About? + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + Thu, 14 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + <p>This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. 
This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.</p> +<p>With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.</p> +<p>The essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.</p> +<p>So far, we see that SNB confronts the implementor with choices in the following areas:</p> +<ul> +<li>Data model: Relational, RDF, property graph?</li> +<li>Physical model, e.g. row-wise vs. column-wise storage</li> +<li>Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures</li> +<li>Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers</li> +<li>Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc.</li> +<li>Parameters vs. literals: Sometimes different parameter values result in different optimal query plans</li> +<li>Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload</li> +<li>Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing.</li> +</ul> +<p>In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. 
Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.</p> +<ul> +<li>Data generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary; re-generating every time is not an option.</li> +<li>Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce?</li> +<li>Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a &lsquo;fast&rsquo; and &lsquo;slow&rsquo; case of a single query template. How long does one need to run to balance these fluctuations?</li> +<li>Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput.</li> +<li>Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse.</li> +<li>Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. 
The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable.</li> +<li>Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy?</li> +<li>Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters?</li> +</ul> +<p>The following posts will look at the above in light of actual experience.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + Why Do We Need an LDBC SNB-Specific Workload Driver? + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + Tue, 21 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + <p>In a previous <a href="https://ldbcouncil.org/tags/driver">3-part blog series</a> we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. 
What we didn&rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - &ldquo;graph&rdquo; is the word that best captures the novelty and difficulty of this work.</p> +<p><strong>Workload Execution - Traditional vs Graph</strong></p> +<p>Transactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.</p> +<p>To understand what is meant by &ldquo;traditional relational workloads&rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. Note, &ldquo;dependency&rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator&rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. 
In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order <em>for a given user</em>. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.</p> +<p>A significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each other&rsquo;s walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other users&rsquo; posts, etc.) initiated by a given user. Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. 
Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.</p> +<p>Such scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend&rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. <em>Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.</em></p> +<p><strong>Because it&rsquo;s a graph</strong></p> +<p>In short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.</p> + + + + + Event Driven Post Generation in Datagen + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + Fri, 10 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + <p>As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.</p> +<p>First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during the friendship edge generation process. Then, for each person of the block, three types of forums are created:</p> +<ul> +<li> +<p>The wall of the person</p> +</li> +<li> +<p>The albums of the person</p> +</li> +<li> +<p>The groups where the person is a moderator</p> +</li> +</ul> +<p>We will focus on group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.</p> +<p>After assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible for generating, given a forum, a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.</p> +<p>Flashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to the &ldquo;Enrique Iglesias&rdquo; tag, whose level is 11 and occurs on 29th of May of 2012 at 09:33:47.</p> +<p>Once the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:</p> +<ul> +<li> +<p>Determine the number of posts to generate</p> +</li> +<li> +<p>Select a random member of the group that will generate the post</p> +</li> +<li> +<p>Determine the event the post will be related to given the aforementioned cumulative distribution</p> +</li> +<li> +<p>Assign the date of the post based on the event date</p> +</li> +</ul> +<p>In order to assign the date to the post, based on the date of the event the post is assigned to, we use the following probability density, which has been extracted from <a href="#references">[1]</a>. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.</p> +<p><img src="index.png" alt=""></p> +<p>Following the example of &ldquo;Enrique Iglesias&rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.</p> +<p><img src="index2.png" alt=""></p> +<p>In this blog entry we have seen how Datagen creates event driven user activity. This allows us to reproduce the heterogeneous post creation density found in a real social network, where post creation is driven by real world events.</p> +<h4 id="references">References</h4> +<p>[1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. 
KDD 2009: 497-506</p> + + + + + Sixth TUC Meeting + https://ldbcouncil.org/event/sixth-tuc-meeting/ + Thu, 19 Mar 2015 13:53:33 -0400 + + https://ldbcouncil.org/event/sixth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on <strong>Thursday and Friday March 19/20, 2015.</strong></p> +<p>The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the first benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.</li> +<li>Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<h3 id="agenda">Agenda</h3> +<p><strong>Thursday 19th March</strong></p> +<p>11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)</p> +<p>11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981131.pdf">slides</a></p> +<p>12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)</p> +<p>12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981137.pdf">slides</a></p> +<p>12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain</p> +<p>12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive</p> +<p>13:10 Claudio Martella (VUA): Giraph and Lighthouse</p> +<p>13:30 - 14:30 Lunch break</p> +<p>14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)</p> +<p>14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981132.pdf">slides</a></p> +<p>14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981133.pdf">slides</a></p> +<p>15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981139.pdf">slides</a></p> +<p>15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs</p> +<p>18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.</p> +<p>20:00 Social dinner at <a href="http://www.bastaix.com">Bastaix Restaurant</a>.</p> +<p><strong>Friday 20th March</strong></p> +<p>9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)</p> +<p>9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics</p> +<p>9:50 Alexandru Iosup (TU Delft). 
Graphalytics: A big data benchmark for graph-processing platforms</p> +<p>10:10 John Snelson (MarkLogic): Introduction to MarkLogic</p> +<p>10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload</p> +<p>10:50 Moritz Kaufmann. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/moritz-kaufmann-ldbc-snb-benchmark-auditing-6th-ldbc-tuc.pdf">The auditing experience</a></p> +<p>11:15 - 11:45 Coffee break</p> +<p>11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)</p> +<p>11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox</p> +<p>12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data</p> +<p>12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments</p> +<p>12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981138.pdf">slides</a></p> +<p>13:30 - 14:30 Lunch break</p> +<p>15:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>19th and 20th March 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held at &ldquo;Aula Master&rdquo; at A3 building located inside the &ldquo;Campus Nord UPC&rdquo; in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h5 id="maps-and-situation"><strong>Maps and situation</strong></h5> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<h5 id="finding-upc"><strong>Finding UPC</strong></h5> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<h5 id="finding-the-meeting-room"><strong>Finding the meeting room</strong></h5> +<h5 id="getting-there">Getting there</h5> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to<br> +the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<h5 id="the-locations-of-the-airport-and-the-city-centre"><strong>The locations of the airport and the city centre</strong></h5> + + + + + The LDBC Datagen Community Structure + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + Sun, 15 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + <p>This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.</p> +<p>When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, typically have highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.</p> +<p>The first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 <a href="#references">[1]</a>. Here we summarize the paper and its contributions and findings.</p> +<p>Existing synthetic graph generators, such as Rmat <a href="#references">[2]</a> and Mag <a href="#references">[3]</a>, are designed to produce graphs with long tailed distributions and large clustering coefficient, but completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR <a href="#references">[4]</a>, a graph generator that not only produces graphs with realistic high level characteristics, but also enforces an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it outputs not only a graph but also the communities present in that graph, hence it can be used to test the quality of a community detection algorithm.</p> +<p>However, no one studied whether the community structure produced by LFR was in fact realistic compared to real graphs. Even though the communities in LFR exhibit interesting properties, such as the expected larger internal density than external, or a longtailed distribution of community sizes, they lack the noise and inhomogeneities present in a real graph. 
And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? Is it more or less realistic? The authors of <a href="#references">[1]</a> set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to <a href="#references">[4]</a>.</p> +<table> +<thead> +<tr> +<th></th> +<th><em>Nodes</em></th> +<th><em>Edges</em></th> +</tr> +</thead> +<tbody> +<tr> +<td><em>Amazon</em></td> +<td>334863</td> +<td>925872</td> +</tr> +<tr> +<td><em>Dblp</em></td> +<td>317080</td> +<td>1049866</td> +</tr> +<tr> +<td><em>Youtube</em></td> +<td>1134890</td> +<td>2987624</td> +</tr> +<tr> +<td><em>Livejournal</em></td> +<td>3997962</td> +<td>34681189</td> +</tr> +</tbody> +</table> +<p>The authors of <a href="#references">[1]</a> selected a set of statistical indicators to<br> +characterize the communities:</p> +<ul> +<li>The clustering coefficient</li> +<li>The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community.</li> +<li>The bridge ratio, which is the ratio of edges whose removal disconnects the community.</li> +<li>The diameter</li> +<li>The conductance</li> +<li>The size</li> +</ul> +<p>The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. 
We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.</p> +<ul> +<li>Many of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and bridge ratio. This suggests that communities cannot be modeled using a single model. 84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact. Ground truth communities are not very isolated; they have a lot of connections pointing outside of the community.</li> +<li>Most of the communities are small (10 or fewer nodes).</li> +<li>In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index.png" alt=""></td> +<td style="text-align:center"><img src="index2.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index3.png" alt=""></td> +<td style="text-align:center"><img src="index4.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index5.png" alt=""></td> +<td style="text-align:center"><img src="index6.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> 
+<p>The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index8.png" alt=""></td> +<td style="text-align:center"><img src="index9.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index10.png" alt=""></td> +<td style="text-align:center"><img src="index11.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index11.png" alt=""></td> +<td style="text-align:center"><img src="index12.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>The main conclusions that can be extracted from DATAGEN can be summarized as follows:</p> +<ul> +<li>DATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio.</li> +<li>The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the youtube and livejournal graphs.</li> +<li>Communities of DATAGEN graphs are, as in real graphs, not isolated, but in this case their level of isolation is significantly larger.</li> +<li>The diameter is small like in the real graphs.</li> +<li>It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and 
Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics.</li> +</ul> +<p>Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. From them, the authors extract the following conclusions:</p> +<ul> +<li>LFR graphs do not show the multimodal distribution observed in real graphs</li> +<li>Only the diameter shows a similar shape as in the ground truth.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index13.png" alt=""></td> +<td style="text-align:center"><img src="index14.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index15.png" alt=""></td> +<td style="text-align:center"><img src="index16.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index17.png" alt=""></td> +<td style="text-align:center"><img src="index18.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman&rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. 
On the other hand, LFR only succeeds significantly in the case of the Diameter.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index19.png" alt=""></td> +<td style="text-align:center"><img src="index20.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index21.png" alt=""></td> +<td style="text-align:center"><img src="index22.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index23.png" alt=""></td> +<td style="text-align:center"><img src="index24.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could be potentially exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!</p> +<h4 id="references">References</h4> +<p>[1] Arnau Prat-Pérez, <a href="http://dblp.uni-trier.de/pers/hd/d/Dom=iacute=nguez=Sal:David">David Domínguez-Sal</a>: How community-like is the structure of synthetically generated graphs? <a href="http://dblp.uni-trier.de/db/conf/sigmod/grades2014.html#PratD14">GRADES 2014</a></p> +<p>[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014</p> +<p>[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics</p> +<p>[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. 
Benchmark graphs for testing community detection algorithms. Physical Review E 2008.</p> + + + + + Industry Relevance of the Semantic Publishing Benchmark + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + Tue, 03 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + <h3 id="publishing-and-media-businesses-are-going-through-transformation">Publishing and media businesses are going through transformation</h3> +<p>I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendees were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. Not long ago, in the year 2000, this stand was full. Back then, people in the Bay Area were willing to pay for printed newspapers. But this is no longer true.</p> +<p>What’s driving this change in publishing and media?</p> +<ul> +<li> +<p>Widespread and instantaneous distribution of information over the Internet has turned news into somewhat of a &ldquo;commodity&rdquo; and few people are willing to pay for it</p> +</li> +<li> +<p>The wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;</p> +</li> +<li> +<p>Open access publishing has limited the ability of academic publishers to sell journals and books at prices that were considered fair ten years ago.</p> +</li> +</ul> +<p><em>Alongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.</em></p> +<h3 id="imagine-instant-news-in-context-imagine-personal-channels-imagine--triplestores">Imagine instant news in context, Imagine personal channels, Imagine &hellip; triplestores</h3> +<p>While plain 
news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. It is also well-known that producing such news in &ldquo;near real time&rdquo; is difficult and expensive using legacy processes and content management technology.</p> +<p>Another example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like <a href="http://new.dowjones.com/products/factiva/">Factiva</a>, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.</p> +<p>Finally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.</p> +<p>Many publishers have been pressed to advance their business. This, in turn, has led to a quest to innovate. And semantic technology can help publishers in two fundamental ways:</p> +<ol> +<li>Generation of rich and &ldquo;meaningful&rdquo; (trying not to use &ldquo;semantic&rdquo; :-) metadata descriptions; 1. Dynamic retrieval of content, based on this rich metadata, enabling better delivery.</li> +</ol> +<p>In this post I write about &ldquo;semantic annotation&rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. 
The final part of the post is about triplestores – semantic graph database engines, used in DSP. To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.</p> +<h3 id="semantic-annotation-produces-rich-metadata-descriptions--the-fuel-for-semantic-publishing">Semantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing</h3> +<p>The most popular meaning of &ldquo;semantic annotation&rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.</p> +<p><img src="02_semantic_repository.png" alt=""></p> +<p>The concept of using <a href="http://infosys3.elfak.ni.ac.rs/nastava/attach/SemantickiWebKurs/sdarticle.pdf">text-mining for automatic semantic annotation</a> of text with respect to very large datasets, such as <a href="http://dbpedia.org/">DBPedia</a>, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether &ldquo;Paris&rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. 
Sometimes this is massively difficult – try to instruct a computer how to guess whether &ldquo;Hilton&rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or that I had the chance to meet Paris Hilton in person on the street in San Francisco.</p> +<p>Today there are plenty of tools (such as the <a href="https://www.ontotext.com/semantic-solutions/media-publishing/">Ontotext Media and Publishing</a> platform and <a href="https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki">DBPedia Spotlight</a>) and services (such as Thomson Reuters’ <a href="http://www.opencalais.com/">OpenCalais</a> and Ontotext’s <a href="http://s4.ontotext.com">S4</a>) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios, where technology like this would revolutionize a business. This is the case with the Dynamic Semantic Publishing scenario described below.</p> +<h3 id="the-bbcs-dynamic-semantic-publishing-dsp">The BBC’s Dynamic Semantic Publishing (DSP)</h3> +<p>Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.</p> +<p>BBC Future Media &amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. 
At the end of the day DSP improves the user experience on BBC’s web site.</p> +<p><em>&ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories&rdquo;.</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">Jem Rayfield, Senior Technical Architect</a>, BBC News and Knowledge</p> +<p>The Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (&ldquo;edited by exception&rdquo;) and support semantic advertisement placement for audiences outside of the UK. The following quote explains the workflow when a new article gets into BBC’s content management system.</p> +<p><em>&ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#language">natural language and ontological determiner process</a> automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.</em></p> +<p><img src="03_bbc_sport.png" alt=""></p> +<p><em>Journalist-published metadata is captured and made persistent for querying using the resource description framework (<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#RDF"><em>RDF</em></a>) metadata representation and triple store technology. 
<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#BigOWLIM">A RDF triplestore</a> and <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#SPARQL">SPARQL</a> approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept &ldquo;Frank Lampard&rdquo;, then the framework infers and applies concepts such as &ldquo;England Squad&rdquo;, &ldquo;Group C&rdquo; and &ldquo;FIFA World Cup 2010&rdquo; &hellip;&rdquo;</em> &ndash; Jem Rayfield</p> +<p>One can consider each of the &ldquo;aggregation pages&rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.</p> +<p><em>&ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content</em></p> +<p><strong>…</strong><strong><em>we are not publishing pages, but publishing content</em></strong> <em>as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.</em></p> +<p><img src="04_content_tagging.png" alt=""></p> +<p><em>… The index pages are published automatically. 
This process is what assures us of the highest quality output, but still <strong>save large amounts of time</strong> in managing the site and <strong>makes it possible for us to efficiently run so many pages</strong> for the World Cup.&rdquo;</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/the_world_cup_and_a_call_to_ac.html">John O&rsquo;Donovan, Chief Technical Architect, BBC Future Media &amp; Technology</a></p> +<p>To get a real feeling about the load of the triplestore behind BBC&rsquo;s World Cup web site, here are some statistics:</p> +<ul> +<li> +<p>800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;</p> +</li> +<li> +<p>Average unique page requests/day: 2 million;</p> +</li> +<li> +<p>Average <strong>SPARQL queries/day: 1 million;</strong></p> +</li> +<li> +<p><strong>100s repository updates/inserts per minute</strong> with OWL 2 RL reasoning;</p> +</li> +<li> +<p>Multi data center that is fully resilient, clustered 6 node triplestore.</p> +</li> +</ul> +<h3 id="the-semantic-publishing-benchmark">The Semantic Publishing Benchmark</h3> +<p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).</p> +<p>SPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. This content is being enriched with metadata that describes it through links to reference knowledge:</p> +<ul> +<li> +<p><em>Reference knowledge:</em> taxonomies and databases that include relevant concepts, entities and factual information (e.g. 
sport statistics);</p> +</li> +<li> +<p><em>Metadata</em> for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.</p> +</li> +</ul> +<p>In this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:</p> +<ul> +<li> +<p><em>Aggregation queries</em> retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;</p> +</li> +<li> +<p><em>Updates</em>, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.</p> +</li> +</ul> +<p>SPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from <a href="http://www.geonames.org/">Geonames</a> for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.</p> +<p>A more technical introduction to SPB can be found in this <a href="https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark">post</a>. 
Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">post</a>. An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.</p> +<p>Despite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of &ldquo;fast flowing&rdquo; content need to be &ldquo;dispatched&rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:</p> +<ul> +<li> +<p>The Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;</p> +</li> +<li> +<p>Reasoning is needed to map content descriptions to queries in a flexible manner;</p> +</li> +<li> +<p>There are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.</p> +</li> +</ul> +<h3 id="spb-v20--steeper-for-the-engines-closer-to-the-publishers">SPB v.2.0 – steeper for the engines, closer to the publishers</h3> +<p>We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.</p> +<p>The major changes in SPB v.2.0 can be summarized as follows:</p> +<ul> +<li> +<p>Much bigger reference dataset: from 170 thousand to 22 million statements. 
Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;</p> +</li> +<li> +<p>Better interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;</p> +</li> +<li> +<p>Retrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.</p> +</li> +</ul> + + + + + OWL-Empowered SPARQL Query Optimization + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + Wed, 18 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + <p>The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. 
Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.</p> +<p>In this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompany Linked Data.</p> +<p>OWL adopts the Open World Assumption and hence OWL axioms are perceived primarily as a means to infer new knowledge. Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.</p> +<p>This richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.</p> +<p>The aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. 
To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found <a href="LDBC_D4.4.2_final.pdf">here</a>.</p> +<h3 id="schema-based-optimization-techniques">Schema-Based Optimization Techniques</h3> +<p>Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.</p> +<p>A simple first case is the case of constraint violation. Consider the query below, which returns all instances of class <code>&lt;A&gt;</code> which are fillers of a specific property <code>&lt;P&gt;</code>. If the underlying schema contains the information that the range of <code>&lt;P&gt;</code> is class <code>&lt;B&gt;</code>, and that class <code>&lt;B&gt;</code> is disjoint from class <code>&lt;A&gt;</code>, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.</p> +<pre tabindex="0"><code>SELECT ?v +WHERE { ?v rdf : type &lt;A&gt; . + ?u &lt;P&gt; ?v . ?u &lt;P&gt; ?v1 . + ?u &lt;P1 &gt; ?v2 . ?u &lt;P2 &gt; ?v3 . + ?u &lt;P3 &gt; ?v4 . ?u &lt;P4 &gt; ?v5} +</code></pre><p>Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class <code>&lt;A1&gt;</code> can immediately be identified as not being in the answer set.</p> +<pre tabindex="0"><code>SELECT ?c +WHERE { ?x rdf: type ?c . ?x &lt;P&gt; ?y . 
+ FILTER NOT EXISTS { ?x rdf:type &lt;A1&gt; }} +</code></pre><p>Another category of schema-empowered optimizations has to do with improved selectivity estimation. In this respect, knowledge about the cardinality (minimum cardinality, maximum cardinality, exact cardinality, functionality) of a property can be exploited to formulate better query plans, even if data statistics are incomplete, missing or erroneous.</p> +<p>Similarly, taking into account class hierarchies, or the definition of classes/properties via set theoretic constructs (union, intersection) at the schema level, can provide valuable information on the selectivity of certain triple patterns, thus facilitating the process of query optimization. Similar effects can be achieved using information about properties (functionality, transitivity, symmetry, etc.).</p> +<p>As an example of these patterns, consider the query below, where class <code>&lt;C&gt;</code> is defined as the intersection of classes <code>&lt;C1&gt;</code>, <code>&lt;C2&gt;</code>. Thus, the triple pattern <code>(?x rdf:type &lt;C&gt;)</code> is more selective than <code>(?y rdf:type &lt;C1&gt;)</code> and <code>(?z rdf:type &lt;C2&gt;)</code> and this should be immediately recognizable by the optimizer, without having to resort to cost estimations. This example also shows how unnecessary triple patterns can be pruned from a query to reduce the number of necessary joins. Figure 1 illustrates the query plan obtained when the OWL intersectionOf construct is used.</p> +<pre tabindex="0"><code>SELECT ?x +WHERE { ?x rdf:type &lt;C&gt; . ?x &lt;P1&gt; ?y . + ?y rdf:type &lt;C1&gt; . ?y &lt;P2&gt; ?z . ?z rdf:type &lt;C2&gt; } +</code></pre><p><img src="owl_constraints.png" alt="image"></p> +<p>Schema information can also be used by the query optimizer to rewrite SPARQL queries to equivalent ones that are found in a form for which already known optimization techniques are easily applicable. 
For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property <code>P4</code> is a symmetric property.</p> +<pre tabindex="0"><code>SELECT ?y ?y1 ?y2 ?y3 +WHERE { ?x &lt;P1 &gt; ?y . ?x &lt;P2 &gt; ?y1 . + ?x &lt;P3 &gt; ?y2 . ?y3 &lt;P4 &gt; ?x } +</code></pre><h3 id="conclusion">Conclusion</h3> +<p>In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:</p> +<ul> +<li>Cases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results.</li> +<li>Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing.</li> +<li>Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query.</li> +<li>Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques.</li> +</ul> +<p>This list is by no means complete, as further cases can be identified by optimizers. 
Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.</p> + + + + + Person Activity Subgraph Features in LDBC DATAGEN + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + Wed, 04 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + <p>When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other things that are worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text, images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed of posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.</p> +<p>When looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries need to traverse parts of the person activity subgraph, and about 80% of all the data generated by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to making sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. 
In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.</p> +<h3 id="reaslistic-message-content">Realistic Message Content</h3> +<p>Messages&rsquo; content in DATAGEN is not random, but contains snippets of text extracted from Dbpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of comments (replies to posts), there is a probability of 0.66 that they are very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is typical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying to posts change during the flow of the conversation, moving from the post&rsquo;s tags to other related or randomly selected tags.</p> +<h3 id="non-uniform-activity-levels">Non uniform activity levels</h3> +<p>In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomenon by correlating the activity level with the number of friends the person has. That is, the larger the number of friends a person has, the larger the number of posts it creates, and also, the larger the number of groups it belongs to.</p> +<h3 id="time-correlated-post-and-comment-generation">Time correlated post and comment generation</h3> +<p>In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. 
For this reason, we observe spikes of activity around these events, where the number of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also, not all events are equally relevant, so some spikes are larger than others. The shape of the activity is modeled following the model described in <a href="#references">[1]</a>. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.</p> +<p><img src="1.png" alt="image"></p> +<p>As we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing activity volume is due to the fact that more people are added to the social network as time advances.</p> +<p>In this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.</p> +<h4 id="references">References</h4> +<p>[1] Leskovec, J., Backstrom, L., &amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In <em>Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 497-506). 
ACM.</p> + + + + + SNB Driver - Part 2: Tracking Dependencies Between Queries + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + Fri, 23 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + <p>The <a href="https://ldbcouncil.org/post/snb-driver-part-1">SNB Driver part 1</a> post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we&rsquo;ll drill down deeper into the details of what it means to execute &ldquo;dependent queries&rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.</p> +<h3 id="definitions">Definitions</h3> +<ul> +<li> +<p><em>Simulation Time (ST)</em>: notion of time created by data generator. All time stamps in the generated data set are in simulation time</p> +</li> +<li> +<p><em>Real Time (RT)</em>: wall clock time</p> +</li> +<li> +<p><em>Time Compression Ratio</em>: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark</p> +</li> +<li> +<p><em>Operation</em>: read and/or write</p> +</li> +<li> +<p><em>Dependencies</em>: operations in this set introduce dependencies in the workload. 
That is, for every operation in this set there exists at least one other operation (in Dependents) that cannot be executed until this operation has been processed</p> +</li> +<li> +<p><em>Dependents</em>: operations in this set are dependent on at least one other operation (in Dependencies) in the workload</p> +</li> +<li> +<p><em>Due Time (DueT)</em>: point in simulation time at which the execution of an operation should be initiated.</p> +</li> +<li> +<p><em>Dependent Time (DepT)</em>: in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.</p> +</li> +<li> +<p><em>Safe Time (SafeT)</em>: time duration.</p> +<ul> +<li> +<p>when two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them</p> +</li> +<li> +<p>SafeT is the minimum duration between the Dependent Time and Due Time of any operation in Dependents</p> +</li> +</ul> +</li> +<li> +<p><em>Operation Stream</em>: sequence of operations ordered by Due Time (dependent operations must be separated by at least SafeT)</p> +</li> +<li> +<p><em>Initiated Operations</em>: operations that have started executing but not yet finished</p> +</li> +<li> +<p><em>Local Completion Time (per driver)</em>: point in simulation time behind which there are no uncompleted operations. Local Completion Time = min(min(Initiated Operations), max(Completed Operations))</p> +</li> +<li> +<p><em>Global Completion Time (GCT)</em>: minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation that operation is safe to execute, i.e., the operations it depends on have all completed executing. 
Global Completion Time = min(Local Completion Time)​</p> +</li> +<li> +<p><em>Execution Window (Window)</em>: a timespan within which all operations can be safely executed</p> +<ul> +<li> +<p>All operations satisfying window.startTime &lt;= operation.DueT &lt; window.endTime may be executed</p> +</li> +<li> +<p>Within a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window</p> +</li> +<li> +<p>To ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 &lt; window.duration &lt;= SafeT</p> +</li> +<li> +<p>Window duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable</p> +</li> +<li> +<p>Before any operations within a window can start executing it is required that: GCT &gt;= window.startTime - (SafeT - window.duration)</p> +</li> +<li> +<p>All operations within a window must initiate and complete between window start and end times: window.startTime &lt;= operation.initiate &lt; window.endTime and window.startTime &lt;= operation.complete &lt; window.endTime</p> +</li> +</ul> +</li> +<li> +<p><em>Dependency Mode</em>: defines dependencies, constraints on operation execution order</p> +</li> +<li> +<p><em>Execution Mode</em>: defines how the runtime should execute operations of a given type</p> +</li> +</ul> +<h3 id="tracking-dependencies">Tracking Dependencies</h3> +<p>Now, the fun part, making sure dependent operations are executed in the correct order.</p> +<p>Consider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. 
That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).</p> +<p>Logically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:</p> +<ul> +<li>the set of operations that have started executing but not yet finished.</li> +</ul> +<p>Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:</p> +<ul> +<li>the set of operations that have started and finished executing.</li> +</ul> +<p>Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):</p> +<ul> +<li>the point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time.</li> +</ul> +<p>LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:</p> +<ul> +<li> +<p><em>Due Time</em>: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled</p> +</li> +<li> +<p><em>GCT</em>: every operation (from Dependencies) with a Due Time before this point in time has completed execution</p> +</li> +</ul> +<p>However, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. 
What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:</p> +<ul> +<li>in addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute.</li> +</ul> +<p>Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.</p> +<h3 id="scalable-execution-in-the-presence-of-dependencies">Scalable execution in the Presence of Dependencies</h3> +<p>The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:</p> +<p>a) make the generated load less &lsquo;bursty&rsquo;</p> +<p>b) allow the driver to &lsquo;scale&rsquo;, so when the driver is given more resources (CPUs, servers, etc.) 
it is able to generate more load.</p> +<p>In the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. Logically, all operations within a Window are executed at the same time, some time within the Window. No guarantee is made regarding exactly when, or in what order, an operation will execute within its Window.</p> +<p>The reasons this approach is correct are as follows:</p> +<ul> +<li> +<p>Operations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked</p> +</li> +<li> +<p>The minimum duration between the Dependent Time and Due Time of any operation in Dependents is known (can be calculated by scanning through the workload once); this duration is referred to as Safe Time (SafeT)</p> +</li> +<li> +<p>A window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until</p> +<p>GCT &gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT</p> +</li> +</ul> +<p>The advantages of such an execution mode are as follows:</p> +<ul> +<li> +<p>As no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window</p> +</li> +<li> +<p>Then, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. 
There is no need or benefit to communicating GCT protocol message more frequently than approximately Window.duration, the side effect of which is reduced network traffic</p> +</li> +<li> +<p>Further, by making no guarantees regarding the order of execution the driver is free to reschedule operations (within Window bounds). The advantage being that operations can be rearranged in such a way as to reduce unwanted bursts of load during execution, which could otherwise occur while synchronizing GCT during demanding workloads. For example, a uniform scheduler may modify operation Due Times to be uniformly distributed across the Window timespan, to &lsquo;smoothen&rsquo; the load within a Window.</p> +</li> +</ul> +<p>As with any system, there are trade-offs to this design, particularly regarding Window.duration. The main trade-off is that between &lsquo;workload resolution&rsquo; and scalability. Increasing Window.duration reduces synchronization but also reduces the resolution at which the workload definition is followed. That is, the generated workload becomes less like the workload definition. However, as this is both bounded and configurable, it is not a major concern. This issue is illustrated in Figure 1, where the same stream of events is split into two different workloads based on different size of the Window. The workload with Window size 5 (on the right) has better resolution, especially for the &lsquo;bursty&rsquo; part of the event stream.</p> +<p><img src="window-scheduling.png" alt="image"><br> +Figure 1. Window scheduling</p> +<p>This design also trades a small amount of repeatability for scalability: as there are no timing or ordering guarantees within a window, two executions of the same window are not guaranteed to be equivalent - &lsquo;what happens in the window stays in the window&rsquo;. Despite sacrificing this repeatability, the results of operations do not change. 
No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.</p> + + + + + SNB Driver - Part 3: Workload Execution Putting It All Together + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + Tue, 20 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + <p>Up until now we have introduced the <a href="https://ldbcouncil.org/post/snb-driver-part-1">challenges faced when executing the LDBC SNB benchmark</a>, as well as explained <a href="https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries">how some of these are overcome</a>. With the foundations laid, we can now explain precisely how operations are executed.</p> +<p>Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.</p> +<h3 id="dependency-modes">Dependency Modes</h3> +<p>While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.</p> +<p>Another way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. 
The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. During workload execution, operations of each type are treated as follows:</p> +<p><strong>• None</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>– Prior Execution: do nothing – After Execution: do nothing</p> +<p><strong>• Read Only</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: wait for GCT &gt;= operation.DepTime – After Execution: do nothing</p> +<p><strong>• Write Only</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<p><strong>• Read Write</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced 
sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations, wait for GCT &gt;= operation.DepT</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<h3 id="execution-modes">Execution Modes</h3> +<p>Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more streams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.</p> +<p><strong>• Asynchronous</strong>: operations are executed individually, when their Due Time arrives.</p> +<p>Motivation: This is the default execution mode, it executes operations as true to the workload definition as possible.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: unbounded</p> +<p>– Max Execution Time: unbounded</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay</p> +<p><strong>• Synchronous</strong>: operations are executed individually, sequentially, in blocking manner.</p> +<p>Motivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. 
However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency. Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.</p> +<p>The alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. 
Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler</p> +<p>– Execute When time &gt;= operation.DueT and previousOperation.completed == true (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: 1</p> +<p>– Max Execution Time: nextOperation.DueT - operation.DueT</p> +<p>– Failure: operation execution starts later than: operation.DueT Tolerated Delay E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay</p> +<p><strong>• Partially Synchronous</strong> (Windowed Execution, described in Section 3.4 in more details), groups of operations from the same time window are executed together</p> +<p>– Re-scheduling Before Execution: Yes, as long as the following still holds:</p> +<p>window.startTime &lt;= operation.DueT &lt; window.startTime + window.duration</p> +<p>Operations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due Times, and therefore ordering, may be modified</p> +<p>– Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: number of operations within window</p> +<p>– Max Execution Time: (window.startTime + window.duration) - operation.DueT</p> +<p>– Failure: operation execution starts later than: window.startTime window.duration operation execution does not finish by: window.startTime + window.duration</p> +<h3 id="tying-it-back-to-ldbc-snb">Tying it back to LDBC SNB</h3> +<p>The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stem from dependencies introduced by the graph structure. 
In other words, workload partitioning becomes as hard as graph partitioning.</p> +<p>The LDBC SNB data can in fact be seen as a union of two parts:</p> +<ol> +<li> +<p>Core Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.</p> +</li> +<li> +<p>User Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the &ldquo;core&rdquo; part are satisfied (i.e., users don&rsquo;t post things before the profiles are created, etc.).</p> +</li> +</ol> +<p>In order to avoid friendship graph partitioning, the driver introduces the concept SafeT, the minimal simulation time that should pass between two dependent events.</p> +<p>This property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety. Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.</p> +<p>On the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there is typically a lot of forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. 
we efficiently create the so-called flash-mob effects in the posting/replying workload).</p> + + + + + Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + Tue, 13 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + <p>Until now we have discussed several aspects of the <a href="https://ldbcouncil.org/benchmarks/spb">Semantic Publishing Benchmark (SPB)</a> such as the <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">difference in performance between virtual and real servers configuration</a>, how to choose an <a href="https://ldbcouncil.org/post/making-semantic-publishing-execution-rules">appropriate query mix</a> for a benchmark run and our experience with using SPB in the development process of GraphDB for <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">finding performance issues</a>.</p> +<p>In this post we provide a step-by-step guide on how to run SPB using the <a href="http://rdf4j.org/">Sesame</a> RDF data store on a fresh install of <a href="http://releases.ubuntu.com/14.04.1/">Ubuntu Server 14.04.1</a>. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.</p> +<h3 id="prerequisites">Prerequisites</h3> +<p>We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:</p> +<ul> +<li>Git</li> +<li>Apache Ant 1.8 or higher</li> +<li>OpenJDK 6 or Oracle JDK 6 or higher</li> +<li>Apache Tomcat 7 or higher</li> +</ul> +<p>If you already have these components installed on your machine you can directly proceed to the next section: <em>Installing Sesame</em></p> +<p>Following are sample commands which can be used to install the required software components:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo apt-get install git +</span></span><span style="display:flex;"><span>sudo apt-get install ant +</span></span><span style="display:flex;"><span>sudo apt-get install default-jdk +</span></span><span style="display:flex;"><span>sudo apt-get install tomcat7 +</span></span></code></pre></div><p>Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.</p> +<p>After a successful installation of Apache Tomcat you should be able to get the default splash page <em>“It works”</em> when you open your web browser and enter the following address: http://&lt;your_ip_address&gt;:8080</p> +<h3 id="installing-sesame">Installing Sesame</h3> +<p>We will use current Sesame version 2.7.14. 
You can download it <a href="http://sourceforge.net/projects/sesame/files/Sesame%202/">here</a> or run following command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>wget <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download&#34;</span> <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> -O openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>Then extract the Sesame tarball:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>tar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>To deploy sesame you have to copy the two war files that are in <em>openrdf-sesame-2.7.14/war</em> to <em>/var/lib/tomcat7/webapps</em></p> +<p>From <em>openrdf-sesame-2.7.14/war</em> you can do it with command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>cp openrdf-*.war &lt;tomcat_install&gt;/webapps +</span></span></code></pre></div><p>Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.</p> +<p>By default the configuration directory is: <em>/usr/share/tomcat7/.aduna</em></p> +<p>Create the directory:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" 
data-lang="bash"><span style="display:flex;"><span>sudo mkdir /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Then change the ownership:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chown tomcat7 /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>And finally you should give the necessary permissions:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chmod o+rwx /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Now when you go to: http://&lt;your_ip_address&gt;:8080/openrdf-workbench/repositories</p> +<p>You should get a screen like this:</p> +<p><img src="01-Sesame-repo-list.png" alt="image"></p> +<h3 id="setup-spb">Setup SPB</h3> +<p>You can download the SPB code and find brief documentation on GitHub:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm">https://github.com/ldbc/ldbc_spb_bm</a></p> +<p>A detailed documentation is located here:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf">https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf</a></p> +<p>SPB offers many configuration options which control various features of the benchmark e.g.:</p> +<ul> +<li>query mixes</li> +<li>dataset size</li> +<li>loading datasets</li> +<li>number of agents</li> +<li>validating results</li> +<li>test conformance to OWL2-RL ruleset</li> +<li>update rate of agents</li> +</ul> +<p>Here we demonstrate how to generate a dataset and execute a simple test<br> +run with it.</p> +<p>First download the SPB source code from the repository:</p> +<div class="highlight"><pre tabindex="0" 
style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>git clone https://github.com/ldbc/ldbc_spb_bm.git +</span></span></code></pre></div><p>Then in the ldbc_spb_bm directory build the project:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>ant build-basic-querymix +</span></span></code></pre></div><p>If you simply execute the command:</p> +<pre tabindex="0"><code>ant +</code></pre><p>you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this step-by-step guide, configuration shown above is sufficient.</p> +<p>Depending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat&rsquo;s startup files e.g. in <em>catalina.sh</em>:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>export JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;-d64 -Xmx4G&#34;</span> +</span></span></code></pre></div><p>To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:</p> +<p><img src="02-Sesame-create-repo.png" alt="image"></p> +<p>Then we need to point the benchmark test driver to the SPARQL endpoint of that repository. 
This is done in <em>ldbc_spb_bm/dist/test.properties</em> file.</p> +<p>The default value of <em>datasetSize</em> in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.</p> +<p>You need to change</p> +<pre tabindex="0"><code>datasetSize=1000000 +</code></pre><p>Also the URLs of the SPARQL endpoint for the repository</p> +<pre tabindex="0"><code>endpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 +endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements +</code></pre><p>First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.</p> +<p>These are the settings to do that, following parameters will &lsquo;instruct&rsquo; the SPB test driver to perform all the actions described above:</p> +<pre tabindex="0"><code>#Benchmark Operational Phases +loadOntologies=true +loadReferenceDatasets=true +generateCreativeWorks=true +loadCreativeWorks=true +generateQuerySubstitutionParameters=true +validateQueryResults=false +warmUp=false +runBenchmark=false +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>To run the benchmark execute the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>java -jar semantic_publishing_benchmark-basic-standard.jar +</span></span><span style="display:flex;"><span>test.properties +</span></span></code></pre></div><p>When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.</p> +<p>Next we will measure the performance of Sesame Data Store by changing some configuration properties:</p> +<pre tabindex="0"><code>#Benchmark 
Configuration Parameters +warmupPeriodSeconds=60 +benchmarkRunPeriodSeconds=300 +... +#Benchmark Operational Phases +loadOntologies=false +loadReferenceDatasets=false +generateCreativeWorks=false +loadCreativeWorks=false +generateQuerySubstitutionParameters=false +validateQueryResults=false +warmUp=true +runBenchmark=true +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>After the benchmark test run has finished result files are saved in folder: <em>dist/logs</em></p> +<p>There you will find three types of results: the result summary of the benchmark run (<em>semantic_publishing_benchmark_results.log),</em> brief results and detailed results.</p> +<p>In <em>semantic_publishing_benchmark_results.log</em> you will find the results distributed per seconds. They should be similar to the listing below:</p> +<p>Benchmark Results for the 300-th second</p> +<pre tabindex="0"><code>Seconds : 300 (completed query mixes : 0) + Editorial: + 2 agents + + 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) + 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) + 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) + + 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) + 0.0300 average operations per second + + Aggregation: + 8 agents + + 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) + 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) + 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) + 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) + 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) + 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) + 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) + 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) + 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) +</code></pre><p>This step-by-step 
guide gave an introduction on how to setup and run the SPB on a Sesame Data Store. Further details can be found in the reference documentation listed above.</p> +<p>If you have any troubles running the benchmark, don&rsquo;t hesitate to comment or use our social media channels.</p> +<p>In a future post we will go through some of the parameters of SPB and check their performance implications.</p> + + + + + Semantic Publishing Instance Matching Benchmark + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + Tue, 30 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + <p>The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.</p> +<p>The SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:</p> +<ul> +<li>value-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity</li> +<li>structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation)</li> +</ul> +<p>The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:</p> +<ul> +<li>instance (in)equality (owl:sameAs, owl:differentFrom)</li> +<li>class and property equivalence (owl:equivalentClass, owl:equivalentProperty)</li> +<li>class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties)</li> +<li>class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf)</li> +<li>property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty)</li> +<li>complex class definitions (owl:unionOf, owl:intersectionOf)</li> +</ul> +<p>SPIMBench uses and extends the ontologies of LDBC&rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB&rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.</p> +<p>Value and structure-based test cases are implemented using the SWING framework <a href="#references">[1]</a> on data and object type properties respectively. 
These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.</p> +<p>SPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.</p> +<p>SPIMBench can be downloaded from <a href="https://github.com/jsaveta/SPIMBench">our repository</a> and a more thorough description thereof can be found on <a href="http://www.ics.forth.gr/isl/spimbench/">http://www.ics.forth.gr/isl/spimbench/</a>.</p> +<h4 id="references">References</h4> +<p>[1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + Further Developments in SNB BI Workload + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + Thu, 18 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + <p>We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and myself are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.</p> +<p>As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.</p> +<p>There are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.</p> +<p>Let’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.</p> +<p>One of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequence of events and the quantity and quality of interactions between players leads to intractably long queries which are hard to understand and debug. 
Therefore, views and intermediate materializations become increasingly necessary.</p> +<p>Another feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.</p> +<p>Since a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.</p> +<p>An orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.</p> +<p>So, let’s see how some of these aspects could be captured in the SNB context.</p> +<p>Geography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.</p> +<p>The communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or less participants. Membership in a high traffic forum with few participants would indicate a strong connection. 
Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -&gt; connection strength. In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.</p> +<p>The behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster in a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.</p> +<p>In John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes near always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”</p> +<p>Analytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.</p> +<p>From a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. 
Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.</p> +<p>This leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.</p> +<p>There is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.</p> + + + + + Sizing AWS Instances for the Semantic Publishing Benchmark + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + Wed, 17 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + <p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">BBC Dynamic Semantic Publishing</a> scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). 
As we <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">wrote earlier</a>, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.</p> +<p>Lately we tested different Amazon Web Services (<a href="https://aws.amazon.com/">AWS</a>) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that <a href="https://www.ontotext.com/products/ontotext-graphdb/">GraphDB</a> experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.</p> +<h3 id="the-experiment">The Experiment</h3> +<p>For our tests we use:</p> +<ul> +<li>GraphDB Standard v6.1</li> +<li>LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: +<ul> +<li>8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also</li> +<li>50M dataset (SF1)</li> +<li>40 minutes of benchmark run time (60 seconds of warm up)</li> +</ul> +</li> +<li>5 different Amazon EC2 instances and one local server</li> +</ul> +<p>Each test run is cold, i.e. 
data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.</p> +<p>We use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.</p> +<p>We also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.</p> +<h3 id="the-results">The Results</h3> +<p>For the tests we measured:</p> +<ul> +<li><em>queries/s</em> for the read threads, where queries include SELECT and CONSTRUCT</li> +<li><em>updates/s</em> for the write threads, where an update operation is INSERT or DELETE</li> +<li><em>queries/$</em> and <em>updates/$</em> – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput</li> +<li><em>update/vCPU</em> – modification operations per vCPU per second</li> +</ul> +<p>Results (Table 1.) 
provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is the c3.2xlarge machine.</p> +<p>The improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold, whereas the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.</p> +<p>Table 1. 
SPB Measurement Results on AWS and Local Servers</p> +<table> +<thead> +<tr> +<th>Server Type</th> +<th>vCPUs</th> +<th>R/W Agents</th> +<th>RAM (GB)</th> +<th>&ldquo;Storage (GB, SSD)&rdquo;</th> +<th>Price USD/h</th> +<th>Queries/ sec.</th> +<th>Updates/ sec.</th> +<th>Queries/ USD</th> +<th>Updates/ USD</th> +<th>Updates/ vCPU</th> +</tr> +</thead> +<tbody> +<tr> +<td>m3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>15</td> +<td>2x 40</td> +<td>0.28</td> +<td>8.39</td> +<td>8.23</td> +<td>107 882</td> +<td>105 873</td> +<td>2.06</td> +</tr> +<tr> +<td>m3.2xlarge</td> +<td>8</td> +<td>8/2</td> +<td>30</td> +<td>2x 80</td> +<td>0.56</td> +<td>15.44</td> +<td>15.67</td> +<td>99 282</td> +<td>100 752</td> +<td>1.96</td> +</tr> +<tr> +<td>c3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>7.5</td> +<td>2x 40</td> +<td>0.21</td> +<td>7.17</td> +<td>6.78</td> +<td>122 890</td> +<td>116 292</td> +<td>1.7</td> +</tr> +<tr> +<td><strong>c3.2xlarge</strong></td> +<td><strong>8</strong></td> +<td><strong>8/2</strong></td> +<td><strong>15</strong></td> +<td><strong>2x 80</strong></td> +<td><strong>0.42</strong></td> +<td><strong>16.46</strong></td> +<td><strong>14.56</strong></td> +<td><strong>141 107</strong></td> +<td><strong>124 839</strong></td> +<td><strong>1.82</strong></td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>8/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>23.23</strong></td> +<td><strong>21.17</strong></td> +<td><strong>99 578</strong></td> +<td><strong>90 736</strong></td> +<td><strong>1.32</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>8/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>22.89</td> +<td>20.39</td> +<td>98 100</td> +<td>87 386</td> +<td>1.27</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/2</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.6</td> +<td>19.11</td> +<td>114 
000</td> +<td>81 900</td> +<td>1.19</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.19</td> +<td>19.18</td> +<td>112 243</td> +<td>82 200</td> +<td>1.2</td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>14/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>30.84</strong></td> +<td><strong>16.88</strong></td> +<td><strong>132 171</strong></td> +<td><strong>72 343</strong></td> +<td><strong>1.06</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>14/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>29.67</td> +<td>17.8</td> +<td>127 157</td> +<td>76 286</td> +<td>1.11</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.11</td> +<td>32.04</td> +<td>156 712</td> +<td>135 302</td> +<td>1</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.31</td> +<td>32.07</td> +<td>157 557</td> +<td>135 429</td> +<td>1</td> +</tr> +<tr> +<td><strong>Local</strong></td> +<td><strong>32</strong></td> +<td><strong>10/2</strong></td> +<td><strong>256</strong></td> +<td><strong>8x 256</strong></td> +<td><strong>0.85</strong></td> +<td><strong>40</strong></td> +<td><strong>31.01</strong></td> +<td><strong>168 916</strong></td> +<td><strong>130 952</strong></td> +<td><strong>0.97</strong></td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.39</td> +<td>26.42</td> +<td>153 672</td> +<td>111 569</td> +<td>0.83</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.22</td> +<td>26.39</td> +<td>152 954</td> +<td>111 443</td> +<td>0.82</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>20/2</td> +<td>256</td> +<td>8x 256</td> 
+<td>0.85</td> +<td>34.59</td> +<td>23.86</td> +<td>146 070</td> +<td>100 759</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<h3 id="the-optimal-number-of-test-agents">The Optimal Number of Test Agents</h3> +<p>Experimenting with different numbers of aggregation (read) and editorial (write) agents at c3.4xlarge and the local server, we made some interesting observations:</p> +<ul> +<li>There is almost no benefit in using more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads;</li> +<li>Using more read agents can have negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the latter case GraphDB handles a somewhat higher number of queries (31 vs. 23) we see a drop in the update rate (from 21 to 17);</li> +<li>Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations;</li> +<li>For machines with more than 16 cores, a configuration like 10/2 or 14/2, would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server;</li> +<li>Launching more than 14 read agents does not help even on local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization.</li> +<li>There is some overhead when handling a larger number of agents, as the local server tests with 14/3 and 20/2 show the worst results for both queries and updates.</li> +</ul> +<h3 id="efficiency-and-cost">Efficiency and Cost</h3> +<p>AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.</p> +<p>Cloud infrastructure providers like Amazon allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.</p> +<p>$1 spent on c3.2xlarge ($0.42/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!</p> +<p>The full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for a machine like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine has 256GB of RAM, which is overkill for the Semantic Publishing Benchmark run at 50M scale. Under all these assumptions we see that using a local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference in price per query in this case is low indicates that using AWS services comes at very low extra cost.</p> +<p>To put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:</p> +<ul> +<li><strong>100 queries/sec.</strong> handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events.</li> +<li><strong>10 updates/sec.</strong> - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in isolation. 
There are relatively few content management applications that need more than 36 000 updates per hour.</li> +<li><strong>$81/day</strong> is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times.</li> +</ul> + + + + + DATAGEN: a Realistic Social Network Data Generator + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + Sat, 06 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/getting-started-with-snb">Getting started with snb</a>, <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">DATAGEN: data generation for the Social Network Benchmark</a>), Arnau Prat discussed the main features and characteristics of DATAGEN: <em>realism</em>, <em>scalability</em>, <em>determinism</em>, <em>usability</em>. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the <a href="https://github.com/ldbc/ldbc_snb_datagen">instructions for generating a SNB dataset</a> and <a href="https://github.com/ldbc/ldbc_snb_implementations/tree/master/interactive/virtuoso">for loading the dataset into Virtuoso</a>. 
In the following sections, we analyze several aspects of the generated dataset.</p> +<h3 id="a-realistic-social-graph">A Realistic social graph</h3> +<p>One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their <em><knows></em> relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (See the note about <a href="https://www.facebook.com/notes/facebook-data-team/anatomy-of-facebook/10150388519243859">Facebook Anatomy</a>). This is not by chance, as DATAGEN has been designed to deliberately reproduce the Facebook&rsquo;s graph distribution.</p> +<p><img src="Cumulative-distribution.png" alt="image"> <br> +Figure 1: Cumulative distribution #friends per user</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-r" data-lang="r"><span style="display:flex;"><span><span style="color:#75715e">#R script for generating the social degree distribution </span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">#Input files: person_knows_person_*.csv</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(data.table) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(igraph) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(plotrix) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">require</span>(bit64) +</span></span><span style="display:flex;"><span>dflist <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">lapply</span>(<span 
style="color:#a6e22e">commandArgs</span>(trailingOnly <span style="color:#f92672">=</span> <span style="color:#66d9ef">TRUE</span>), fread, sep<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;|&#34;</span>, +</span></span><span style="display:flex;"><span> header<span style="color:#f92672">=</span>T, select<span style="color:#f92672">=</span><span style="color:#ae81ff">1</span><span style="color:#f92672">:</span><span style="color:#ae81ff">2</span>, colClasses<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;integer64&#34;</span>) +</span></span><span style="display:flex;"><span> df <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">rbindlist</span>(dflist) <span style="color:#a6e22e">setNames</span>(df, <span style="color:#a6e22e">c</span>(<span style="color:#e6db74">&#34;P1&#34;</span>, <span style="color:#e6db74">&#34;P2&#34;</span>)) +</span></span><span style="display:flex;"><span>d2 <span style="color:#f92672">&lt;-</span> df[,<span style="color:#a6e22e">length</span>(P2),by<span style="color:#f92672">=</span>P1] +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">pdf</span>(<span style="color:#e6db74">&#34;socialdegreedist.pdf&#34;</span>) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">plot</span>(<span style="color:#a6e22e">ecdf</span>(d2<span style="color:#f92672">$</span>V1),main<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Cummulative distribution #friends per user&#34;</span>, +</span></span><span style="display:flex;"><span> xlab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Number of friends&#34;</span>, ylab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Percentage number of users&#34;</span>, log<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;x&#34;</span>, +</span></span><span style="display:flex;"><span> xlim<span style="color:#f92672">=</span><span 
style="color:#a6e22e">c</span>(<span style="color:#ae81ff">0.8</span>, <span style="color:#a6e22e">max</span>(d2<span style="color:#f92672">$</span>V1) <span style="color:#f92672">+</span> <span style="color:#ae81ff">20</span>)) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">dev.off</span>() +</span></span></code></pre></div><h3 id="data-correlations">Data Correlations</h3> +<p>Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.</p> +<p><em>Which are the most popular names of a country?</em></p> +<p>We run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, <em>&lsquo;A_country_name&rsquo;</em> is the name of a particular country such as <em>&lsquo;Germany&rsquo;, &lsquo;Netherlands&rsquo;, or &lsquo;Vietnam&rsquo;</em>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> p_lastname, <span style="color:#66d9ef">count</span> (p_lastname) <span style="color:#66d9ef">as</span> namecnt +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">FROM</span> person, country +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> p_placeid <span style="color:#f92672">=</span> ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;A_country_name&#39;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> p_lastname <span style="color:#66d9ef">order</span> <span style="color:#66d9ef">by</span> namecnt <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as <em>Muller</em> is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names&rsquo; distribution may not be exactly the same as the contemporary names&rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.</p> +<p><img src="distribution-germany.png" alt="image"> <br> +Figure 2. 
Distribution of names in Germany</p> +<p><img src="distribution-netherlands.png" alt=""> <br> +Figure 3. Distribution of names in Netherlands</p> +<p><img src="distribution-vietnam.png" alt=""> <br> +Figure 4. Distribution of names in Vietnam</p> +<p><em>Where my friends are living?</em></p> +<p>We run the following query, which computes the locations of the friends of people living in China.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> fctry.ctry_name, <span style="color:#66d9ef">count</span> (<span style="color:#f92672">*</span>) <span style="color:#66d9ef">from</span> person <span style="color:#66d9ef">self</span>, person +</span></span><span style="display:flex;"><span>friend, country pctry, knows, country fctry +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> pctry.ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;China&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> <span style="color:#66d9ef">self</span>.p_placeid <span style="color:#f92672">=</span> pctry.ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> <span style="color:#66d9ef">self</span>.p_personid <span style="color:#66d9ef">and</span> friend.p_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> fctry.ctry_city <span style="color:#f92672">=</span> friend.p_placeid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> fctry.ctry_name <span 
style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.</p> +<p><img src="chinese-friends.png" alt=""> <br> +Figure 5. Locations of friends of people in China</p> +<p><em>Where my friends are studying?</em></p> +<p>Finally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> o2.o_name, <span style="color:#66d9ef">count</span>(o2.o_name) <span style="color:#66d9ef">from</span> knows, person_university +</span></span><span style="display:flex;"><span>p1, person_university p2, organisation o1, organisation o2 +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> +</span></span><span style="display:flex;"><span> p1.pu_organisationid <span style="color:#f92672">=</span> o1.o_organisationid +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> o1.o_name<span style="color:#f92672">=</span><span style="color:#e6db74">&#39;Hangzhou_International_School&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> p1.pu_personid <span style="color:#66d9ef">and</span> p2.pu_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> 
p2.pu_organisationid <span style="color:#f92672">=</span> o2.o_organisationid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> o2.o_name <span style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, the top-10 universities for the friends of the Hangzhou School students are in China, while people from foreign universities have only a small number of friends who study at Hangzhou School (see Table 1).</p> +<p><img src="friends-international-school.png" alt=""> <br> +Figure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.</p> +<table> +<thead> +<tr> +<th>Name</th> +<th># of friends</th> +</tr> +</thead> +<tbody> +<tr> +<td>Hangzhou_International_School</td> +<td>12696</td> +</tr> +<tr> +<td>Anhui_University_of_Science_and_Technology</td> +<td>4071</td> +</tr> +<tr> +<td>China_Jiliang_University</td> +<td>3519</td> +</tr> +<tr> +<td>&hellip;</td> +<td></td> +</tr> +<tr> +<td>Darmstadt_University_of_Applied_Sciences</td> +<td>1</td> +</tr> +<tr> +<td>Calcutta_School_of_Tropical_Medicine</td> +<td>1</td> +</tr> +<tr> +<td>Chettinad_Vidyashram</td> +<td>1</td> +</tr> +<tr> +<td>Women&rsquo;s_College_Shillong</td> +<td>1</td> +</tr> +<tr> +<td>Universitas_Nasional</td> +<td>1</td> +</tr> +</tbody> +</table> +<p>Table 1. Universities where friends of Hangzhou International School students are studying at.</p> +<p>In a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. 
Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduce many of those important characteristics found in a real social network, and additionally introduce a series of plausible correlations in it. More and more interesting data correlations may also be found from playing with the SNB generated data.</p> + + + + + SNB Driver - Part 1 + https://ldbcouncil.org/post/snb-driver-part-1/ + Thu, 27 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-1/ + <p>In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: <a href="https://github.com/ldbc/ldbc_driver/">https://github.com/ldbc/ldbc_driver/</a>. Multiple reference implementations by two vendors are available here: <a href="https://github.com/ldbc/ldbc_snb_implementations">https://github.com/ldbc/ldbc_snb_implementations</a>, and discussion of the schema, data properties, and related content is available here: <a href="https://github.com/ldbc/ldbc_snb_docs">https://github.com/ldbc/ldbc_snb_docs</a>.</p> +<p>The following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.</p> +<h3 id="problem-description">Problem Description</h3> +<p>The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person&rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. 
If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting transaction stream into parallel clients (terminals) is trivial. However, for LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.</p> +<p>Consider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment to the picture of User A, etc. The second event depends on the first one in a sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in case of the single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. 
Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.</p> +<p>The next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.</p> + + + + + Making Semantic Publishing Execution Rules + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + Tue, 18 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + <p><a href="https://ldbcouncil.org/">LDBC</a> <a href="https://ldbcouncil.org/benchmarks/spb">SPB (Semantic Publishing Benchmark)</a> is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC&rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an <a href="https://www.ontotext.com/products/ontotext-graphdb-owlim/">Ontotext Graph DB</a> deployment. Graph DB was formerly known as Owlim.</p> +<p>So, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.</p> +<p>SPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:</p> +<ul> +<li> +<p>Some queries are centred on a particular search or entity. The data touched by the query size does not grow at the same rate as the dataset.</p> +</li> +<li> +<p>Some queries cover whole cross sections of the dataset, e.g. 
find the most popular tags across the whole database.</p> +</li> +</ul> +<p>These different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.</p> +<p>Another guiding factor of SPB was the BBC&rsquo;s and others&rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.</p> +<p>Normally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:</p> +<ul> +<li> +<p>Updates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.</p> +</li> +<li> +<p>Short queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.</p> +</li> +<li> +<p>Analytics - These cover a large fraction of the dataset and are roughly linear to data size.</p> +</li> +</ul> +<p>A test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. 
A given system may be specially good at one aspect, leading the test sponsor to accentuate this.</p> +<p>The benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 (1.6 billion triples) SPB database we have the following space consumption figures:</p> +<ul> +<li> +<p>46886 MB of RDF literal text</p> +</li> +<li> +<p>23924 MB of full text index for RDF literals</p> +</li> +<li> +<p>23598 MB of URI strings</p> +</li> +<li> +<p>21981 MB of quads, stored column-wise with default index scheme</p> +</li> +</ul> +<p>Clearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.</p> +<p>Let&rsquo;s now look at a full run at unit scale, i.e. 50M triples.</p> +<p>The run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. 
The run stops by itself when the last of the analytical mixes finishes.</p> +<p>The interactive driver reports:</p> +<pre tabindex="0"><code>Seconds run : 2144 + Editorial: + 2 agents + + 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) + 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) + 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) + + 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) + 39.7122 average operations per second + + Aggregation: + 20 agents + + 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) + 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) + 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) + 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) + 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) + 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) + 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) + 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) + 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) + 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) + 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) + + 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] + + 45318 total retrieval queries (0 timed-out) + 22.5239 average queries per second +</code></pre><p>The analytical driver reports:</p> +<pre tabindex="0"><code>Aggregation: + 2 agents + + 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) + 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) + 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) + 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) + 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) + 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) + 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) + 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) + 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) + 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) + 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) + 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) + 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) + 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) + 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] + + 180 total retrieval queries (2 timed-out) + 0.0862 average queries per second +</code></pre><p>The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)</p> +<p>The SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). 
The DBMS is Virtuoso open source (<a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack at github.com</a>, <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics</a>).</p> +<p>The minimum update rate of 7/s was sustained but fell short of the target of 70/s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. The analytics mix is for example about 3x faster without other concurrent activity.</p> +<p>Is this good or bad? I would say that this is possible but better can certainly be accomplished.</p> +<p>The initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualifies the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.</p> +<p>As an initial comment on the workload mix, I&rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.</p> +<p>Adjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.</p> +<p>In the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.</p> + + + + + Fifth TUC Meeting + https://ldbcouncil.org/event/fifth-tuc-meeting/ + Fri, 14 Nov 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fifth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its fifth Technical User<br> +Community (TUC) meeting.</p> +<p>This will be a one-day event at the National Hellenic Research Institute<br> +in Athens, Greece on <strong>Friday November 14, 2014</strong>.</p> +<h3 id="agenda">Agenda</h3> +<p>10:30 - 11:00 Coffee Break</p> +<p>11:00 - 11:10 Peter Boncz (VUA) Welcome &amp; LDBC project status update (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979841.pptx">Presentation</a>)</p> +<p>11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark:Short Presentation of SPB and Status</p> +<p>Feedback &amp; Roadmap for SPB &amp; OWLIM (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979839.pdf">Presentation</a>)</p> +<p>11:25 - 11:30 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SPB &amp; Virtuoso (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979828.pdf">Presentation</a>)</p> +<p>11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback &amp; Roadmap for SNB &amp; Neo4J (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979830.pdf">Presentation</a>)</p> +<p>11:45 - 12:00 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SNB &amp; Virtuoso (<a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979829.pdf">Presentation</a>)</p> +<p>12:00 - 12:20 Arnau Prat (UPC) &amp; Andrey Gubichev Status, Feedback &amp; Roadmap for SNB Interactive &amp; Sparksee (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979836.pdf">Presentation</a> ) and Business Intelligence (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979837.pdf">Presentation</a>)</p> +<p>12:20 - 12:40 Tomer Sagi, &ldquo;Experience with SNB and TitanDB at HP&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979838.pptx">Presentation</a> )</p> +<p>12:40 - 13:00 Jakob Nelson, &ldquo;graphbench.org on the SNB datagen&rdquo;</p> +<p>13:00 - 14:30 Lunch Break@Byzantine &amp; Christian Museum (<a href="http://www.byzantinemuseum.gr/en/">link</a>)</p> +<p>14:30 - 14:50 Olaf Hartig, &ldquo;Integrating the Property Graph and RDF data models&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979831.pdf">Presentation</a>)\</p> +<p>Documents: <a href="http://arxiv.org/abs/1409.3288">arxiv/1409.3288</a>, <a href="http://arxiv.org/abs/1406.3399">arxiv/1406.3399</a></p> +<p>14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, &ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979842.pdf">Presentation</a>)</p> +<p>15:10 - 15:30 Evaggelia Pitoura, &ldquo;Historical Queries on Graphs&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979835.pdf">Presentation</a>)</p> +<p>15:30 - 16:00 Coffee Break</p> +<p>16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, 
&ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979843.pdf">Presentation</a>)</p> +<p>16:20 - 16:40 Gunes Aluc, &ldquo;WatDiv: How to Tune-up your RDF Data Management System&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979832.pdf">Presentation</a>)</p> +<p>16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, &ldquo;Benchmarking @LogicBlox&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979840.pdf">Presentation</a>)</p> +<p>17:00 - 17:15 Hassan Chafi, &ldquo;Oracle Labs Graph Strategy&rdquo;</p> +<p>17:15 - 17:25 Yinglong Xia, &ldquo;Property Graphs for Industry Solution at IBM&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979834.pdf">Presentation</a>)</p> +<p>17:25 - 17:30 Arthur Keen, &ldquo;Short Introduction to SPARQLcity&rdquo;</p> +<p><em><strong>20:30 Dinner @ Konservokouti <a href="https://plus.google.com/114240752029716758955/about?gl=gr&amp;hl=en">(link)</a></strong></em></p> +<p><em><strong>Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion</strong></em></p> +<h4 id="logistics">Logistics</h4> +<p>The meeting will be held at the <a href="http://www.eie.gr/index-en.html">National Hellenic Research Foundation</a> located in <a href="http://www.eie.gr/location-en.html">downtown Athens</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/5964344.gif" alt=""></p> +<h4 id="travel">Travel</h4> +<p>Athens, Greece&rsquo;s capital city, is easily accessible by air. 
Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.</p> +<p>To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).</p> +<p>You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: <a href="http://www.aia.gr/traveler/">http://www.aia.gr/traveler/</a></p> + + + + + Getting Started With the Semantic Publishing Benchmark + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + Sun, 09 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + <p>The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.</p> +<p>The benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:</p> +<ul> +<li>clustering of creative works around certain entities from the reference datasets (e.g. 
the association of an entity with creative works would decay exponentially in time)</li> +<li>correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time)</li> +</ul> +<p>The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.</p> +<p>Currently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. 
The queries aim at checking different choke points relevant to query optimisation such as:</p> +<ul> +<li>join ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema</li> +<li>subselects that aggregate the query results that the optimiser should recognise and evaluate first</li> +<li>optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last</li> +<li>reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.)</li> +<li>unions to be executed in parallel</li> +<li>optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results</li> +<li>ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results</li> +<li>handling of geo-spatial predicates</li> +<li>full-text search optimisation</li> +<li>asynchronous execution of the aggregate sub-queries</li> +<li>use of distinct to choose the optimal query plan</li> +</ul> +<p>We give below Query 1 of the Semantic Publishing Benchmark.</p> +<pre tabindex="0"><code>PREFIX bbcevent:&lt;http://www.bbc.co.uk/ontologies/event/&gt; +PREFIX geo-pos:&lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; +PREFIX bbc:&lt;http://www.bbc.co.uk/ontologies/bbc/&gt; +PREFIX time:&lt;http://www.w3.org/2006/time#&gt; +PREFIX event:&lt;http://purl.org/NET/c4dm/event.owl#&gt; +PREFIX music-ont:&lt;http://purl.org/ontology/mo/&gt; +PREFIX rdf:&lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt; +PREFIX foaf:&lt;http://xmlns.com/foaf/0.1/&gt; +PREFIX provenance:&lt;http://www.bbc.co.uk/ontologies/provenance/&gt; +PREFIX owl:&lt;http://www.w3.org/2002/07/owl#&gt; +PREFIX cms:&lt;http://www.bbc.co.uk/ontologies/cms/&gt; +PREFIX news:&lt;http://www.bbc.co.uk/ontologies/news/&gt; +PREFIX 
cnews:&lt;http://www.bbc.co.uk/ontologies/news/cnews/&gt; +PREFIX cconcepts:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX dbp-prop:&lt;http://dbpedia.org/property/&gt; +PREFIX geonames:&lt;http://sws.geonames.org/&gt; +PREFIX rdfs:&lt;http://www.w3.org/2000/01/rdf-schema#&gt; +PREFIX domain:&lt;http://www.bbc.co.uk/ontologies/domain/&gt; +PREFIX dbpedia:&lt;http://dbpedia.org/resource/&gt; +PREFIX geo-ont:&lt;http://www.geonames.org/ontology#&gt; +PREFIX bbc-pont:&lt;http://purl.org/ontology/po/&gt; +PREFIX tagging:&lt;http://www.bbc.co.uk/ontologies/tagging/&gt; +PREFIX sport:&lt;http://www.bbc.co.uk/ontologies/sport/&gt; +PREFIX skosCore:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX dbp-ont:&lt;http://dbpedia.org/ontology/&gt; +PREFIX xsd:&lt;http://www.w3.org/2001/XMLSchema#&gt; +PREFIX core:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX curric:&lt;http://www.bbc.co.uk/ontologies/curriculum/&gt; +PREFIX skos:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX cwork:&lt;http://www.bbc.co.uk/ontologies/creativework/&gt; +PREFIX fb:&lt;http://rdf.freebase.com/ns/&gt; + +# Query Name : query1 +# Query Description : +# Retrieve creative works about thing t (or that mention t) +# reasoning: rdfs:subClassOf, rdf:type +# join ordering: cwork:dateModified rdf:type owl:FunctionalProperty +# join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty +# Choke Points : +# - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified +# Optimizer should use an efficient cost evaluation method for choosing the optimal join tree +# - A sub-select which aggregates results. Optimizer should recognize it and execute it first +# - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) +# Optimizer should decide to put optional triples on top of the join tree +# (i.e. 
delay their execution to the last possible moment) because OPTIONALs are treated as a left join +# - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork +# and eliminate first triple (?cwork a ?type .) since ?cwork is a cwork:CreativeWork​ + +CONSTRUCT { + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:shortTitle ?shortTitle ; + cwork:about ?about ; + cwork:mentions ?mentions ; + cwork:dateCreated ?created ; + cwork:dateModified ?modified ; + cwork:description ?description ; + cwork:primaryFormat ?primaryFormat ; + bbc:primaryContentOf ?webDocument . + ?webDocument bbc:webDocumentType ?webDocType . + ?about rdfs:label ?aboutLabel ; + bbc:shortLabel ?aboutShortLabel ; + bbc:preferredLabel ?aboutPreferredLabel . + ?mentions rdfs:label ?mentionsLabel ; + bbc:shortLabel ?mentionsShortLabel ; + bbc:preferredLabel ?mentionsPreferredLabel . + ?creativeWork cwork:thumbnail ?thumbnail . + ?thumbnail a cwork:Thumbnail ; + cwork:altText ?thumbnailAltText ; + cwork:thumbnailType ?thumbnailType . +} +WHERE { + { + SELECT ?creativeWork + WHERE { + ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . + ?creativeWork a cwork:CreativeWork ; + cwork:dateModified ?modified . + } + ORDER BY DESC(?modified) + LIMIT 10 + } + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:dateModified ?modified . + OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } + OPTIONAL { ?creativeWork cwork:description ?description . } + OPTIONAL { ?creativeWork cwork:about ?about . + OPTIONAL { ?about rdfs:label ?aboutLabel . } + OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } + OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } + } + OPTIONAL { + ?creativeWork cwork:mentions ?mentions . + OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } + OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . 
} + OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } + } + OPTIONAL { ?creativeWork cwork:dateCreated ?created . } + OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } + OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . + OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } + OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } + } +} +</code></pre><p>Listing 1. Semantic Publishing Benchmark: Query 1</p> +<p>The benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.</p> +<p>The data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. 
The current version of the benchmark has been tested with Virtuoso and OWLIM.</p> +<p>The generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log) (c) the benchmark results (semantic_publishing_benchmark_results.log).</p> +<p>Below we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and deletes) and queries executed at the Nth second of a benchmark run. It also reports the total number of retrieval queries as well as the average number of queries executed per second.</p> +<pre tabindex="0"><code>Seconds run : 600 + Editorial: + 0 agents + + 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) + 0.0000 average operations per second + + Aggregation: + 8 agents + + 298 Q1 queries + 267 Q2 queries + 243 Q3 queries + 291 Q4 queries + 320 Q5 queries + 286 Q6 queries + 255 Q7 queries + 274 Q8 queries + 271 Q9 queries + + 2505 total retrieval queries + 4.1750 average queries per second +</code></pre><p>Listing 2. 
A snippet of semantic_publishing_benchmark_results.log</p> +<p>We run the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries timeout after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:</p> +<table> +<thead> +<tr> +<th>#triples</th> +<th>Q1</th> +<th>Q2</th> +<th>Q3</th> +<th>Q4</th> +<th>Q5</th> +<th>Q6</th> +<th>Q7</th> +<th>Q8</th> +<th>Q9</th> +<th>#queries</th> +<th>avg. #q. per sec.</th> +</tr> +</thead> +<tbody> +<tr> +<td>10M</td> +<td>298</td> +<td>267</td> +<td>243</td> +<td>291</td> +<td>320</td> +<td>286</td> +<td>255</td> +<td>274</td> +<td>271</td> +<td>2505</td> +<td>4.1750</td> +</tr> +<tr> +<td>100M</td> +<td>53</td> +<td>62</td> +<td>51</td> +<td>52</td> +<td>44</td> +<td>62</td> +<td>25</td> +<td>55</td> +<td>45</td> +<td>449</td> +<td>0.7483</td> +</tr> +<tr> +<td>1B</td> +<td>34</td> +<td>29</td> +<td>22</td> +<td>24</td> +<td>25</td> +<td>29</td> +<td>0</td> +<td>29</td> +<td>28</td> +<td>220</td> +<td>0.3667</td> +</tr> +</tbody> +</table> + + + + + Choke Point Based Benchmark Design + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + <p>The <em>Linked Data Benchmark Council</em> (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and 
arbitrate around the official benchmark results. The council and its <a href="https://ldbcouncil.org">https://ldbcouncil.org</a> website just launched, and in its first 1.5 year of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (<a href="https://ldbcouncil.org/event/fifth-tuc-meeting">next TUC meeting</a> will be on October 5 in Athens) and indeed in <em>designing benchmarks</em>.</p> +<p>So, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by <a href="http://www.tpc.org/">TPC</a> have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are <em>relevant</em> and <em>representative</em> (address important challenges encountered in practice), <em>understandable</em> , <em>economical</em> (implementable on simple hardware), <em>fair</em> (such as not to favor a particular product or approach), <em>scalable</em>, <em>accepted</em> by the community and <em>public</em> (e.g. all of its software is available in open source). This list stems from Jim Gray&rsquo;s <a href="http://research.microsoft.com/en-us/um/people/gray/BenchmarkHandbook/TOC.htm">Benchmark Handbook</a>. In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.</p> +<p>A very important aspect of benchmark development is making sure that the community <em>accepts</em> a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. 
Since in LDBC multiple commercial graph and RDF vendors are on the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on <strong>fairness</strong> had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its <a href="https://ldbcouncil.org/tags/tuc-meeting/">Technical User Community gatherings</a>) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.</p> +<p>The need for <em>understandability</em> for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. 
Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.</p> +<p>The <em>economical</em> aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machinecode SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.</p> +<p><em>Representative</em> benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. 
A rigorous example of this is the <a href="http://aksw.org/Projects/DBPSB.html">DBpedia benchmark</a> whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log only contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).</p> +<p>The fact that a benchmark can be <em>scaled</em> in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. 
Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) &ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.</p> +<p>Now, what makes a benchmark <em>relevant</em>? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of <em>&ldquo;choke points&rdquo;</em>: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute value of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). 
The notion observed in practice is that people who are direct colleagues, often are in each other's friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore a suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.</p> +<p>To illustrate what choke points are in more depth, we wrote a <a href="https://ldbcouncil.org/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf">paper in the TPCTC 2013</a> conference that performs a post-mortem analysis of TPC-H and identifies 28 such choke points. <em><a href="chokepoints.png">This table</a></em> lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):</p> +<p>I would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper,RDF-3X), and me (MonetDB,Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can <a href="http://www.openlinksw.com/weblog/oerling/?id=1779">trivialize Q18</a>; and this optimization can single-handedly improve the overall TPC-H score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.</p> +<p>LDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark <a href="https://ldbcouncil.org/developer/spb">(SPB)</a> and the more graph-focused Social Network Benchmark (<a href="https://ldbcouncil.org/developer/snb">SNB</a>), and <a href="https://groups.google.com/forum/#!forum/ldbcouncil">tell us what you think</a>. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.</p> +<p><em>(for more posts from Peter Boncz, see also <a href="https://databasearchitects.blogspot.com">Database Architects</a>, a blog about data management challenges and techniques written by people who design and implement database systems)</em></p> + + + + + New Website Online LDBC Benchmarks Reach Public Draft + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + <p>The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. 
Watch this blog for the announcements to come, as this will be a matter of weeks to add.</p> +<p>The Public Draft stage means that the initial software (data generator, query driver) works and an initial technical specification and documentation have been written. In other words, there is a testable version of the benchmark available for anyone who is interested. Public Draft status does not mean that the benchmark has been adopted yet, it rather means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth <a href="https://ldbcouncil.org/event/fifth-tuc-meeting">Technical User Community meeting</a>. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.</p> +<p>You can also see that we created this new website and a new logo. This website is different from <code>http://ldbc.eu</code> that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.</p> +<p>In the next weeks, you will see many contributors in LDBC post items on this blog. 
Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.</p> + + + + + Social Network Benchmark Goals + https://ldbcouncil.org/post/social-network-benchmark-goals/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/social-network-benchmark-goals/ + <p>Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.</p> +<p>From a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.</p> +<p>With the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, <a href="https://ldbcouncil.org/benchmarks/snb">SNB</a>, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. 
Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.</p> +<p>The SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">LDBC Github repository</a>. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD programming contest 2014</a>.</p> +<p>The SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.</p> +<p>More details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.</p> + + + + + Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + <p>It is with great pleasure that we announce the new LDBC organisation site at <a href="https://www.ldbcouncil.org">www.ldbcouncil.org</a>. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the <em>benchmarks</em> menu on this site.</p> +<p>Those benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.</p> +<p>While the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.</p> +<p>We want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.</p> +<p>Finally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.</p> +<p>In all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.</p> + + + + + 2nd International Workshop on Benchmarking RDF Systems + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + <p>Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops. The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.</p> +<p>More at: <a href="http://events.sti2.at/bersys2014/">http://events.sti2.at/bersys2014/</a></p> + + + + + DATAGEN: Data Generation for the Social Network Benchmark + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + <p>As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 <a href="#references">[1]</a>.</p> +<p>One of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.</p> +<p>For these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:</p> +<ul> +<li><strong>Realism.</strong> The data generated by DATAGEN has to mimic the features of those found in a real social network. 
In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. DATAGEN is aware of the data and link distributions found in a real social network such as Facebook <a href="#references">[2]</a>. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated.</li> +<li><strong>Scalability.</strong> Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters.</li> +<li><strong>Determinism.</strong> DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus, making the comparisons between different systems fair and the benchmarks’ results reproducible.</li> +<li><strong>Usability.</strong> LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been strongly influenced by this philosophy, and therefore it has been designed to be as easy to use as possible.</li> +</ul> +<p>Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (<a href="https://github.com/ldbc/ldbc_snb_datagen">https://github.com/ldbc/ldbc_snb_datagen</a>).</p> +<h4 id="references">References</h4> +<p>[1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. &ldquo;S3g2: A scalable structure-correlated social graph generator.&rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 
156-172.</p> +<p>[2] Prat-Pérez, Arnau, and David Dominguez-Sal. &ldquo;How community-like is the structure of synthetically generated graphs?.&rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. ACM, 2014.</p> + + + + + Getting Started With SNB + https://ldbcouncil.org/post/getting-started-with-snb/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-snb/ + <p>In a previous blog post titled &ldquo;<a href="https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/">Is SNB like Facebook&rsquo;s LinkBench?</a>&rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.</p> +<h3 id="datagen">DATAGEN</h3> +<p>DATAGEN is the data generator used by all the workloads of SNB. <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/">Here</a> we introduced the design goals that drive the development of DATAGEN, which can be summarized as: <em>Realism, Scalability, Determinism and Usability.</em></p> +<p>DATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.</p> +<p><img src="schema.png" alt="image"></p> +<p>For the sake of credibility, data produced by DATAGEN has to be realistic. 
In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:</p> +<ul> +<li> +<p>Realistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as DBpedia.</p> +</li> +<li> +<p>Correlated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical from that country, to work at companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.</p> +</li> +</ul> +<p>DATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.</p> +<p>Finally, DATAGEN outputs two other things:</p> +<ul> +<li> +<p>Update Streams, which will be used in the future to implement updates in the workloads.</p> +</li> +<li> +<p>Substitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so the query plans of the resulting query executions do not differ significantly.</p> +</li> +</ul> +<p>Configuring and using DATAGEN is easy. 
Please visit <a href="https://github.com/ldbc/ldbc_snb_datagen">this page</a> for more information.</p> +<h3 id="ldbc-driver">LDBC driver</h3> +<p>SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.</p> +<p>It is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation <a href="https://github.com/ldbc/ldbc_driver/wiki">page</a>.</p> +<p>The test sponsor (aka the implementer of the benchmark), has to provide a set of implemented interfaces, that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.</p> +<p>Given a workload consisting of a series of <em>Operations</em>, the test sponsor implements <em>OperationHandlers</em> for them. <em>OperationHandlers</em> are responsible for executing instances of a specific operation (query) type. This is done by overriding the method <em>executeOperation</em>(), which receives as input parameter an <em>Operation</em> instance and returns the result. From the <em>Operation</em> instance, the operation&rsquo;s input parameters can be retrieved, as well as the database connection state.</p> +<p>The database connector is used to initialize, cleanup and get the database connection state. 
The database connector must implement the <em>Db</em> interface, which consists of three methods: <em>onInit</em>(), <em>onCleanup</em>() and <em>getConnectionState</em>(). <em>onInit</em>() is called before the benchmark is executed, and is responsible for initializing the database and registering the different <em>OperationHandlers</em>. <em>onCleanup</em>() is called after the benchmark has completed. Any resources that need to be released should be released here.</p> +<p>Finally, <em>getConnectionState</em>() returns an instance of <em>DbConnectionState</em>, which encapsulates any state that needs to be shared between <em>OperationHandler</em> instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.</p> +<p>A good example on how to implement the benchmark can be found <a href="https://github.com/ldbc/ldbc_driver/wiki/Implementing%20a%20Database%20Connector">here</a>.</p> +<h3 id="workloads">Workloads</h3> +<p>Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are in the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.</p> +<p>Interactive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the data base. These queries are designed to be interactive and target systems capable of responding to such queries with low latency for multiple concurrent users. 
Examples of Interactive queries are, given a user, retrieving those friends with a specific name, or finding the most recent post and comments created by your friends.</p> +<p>Business Intelligence workload, will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.</p> +<p>Examples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.</p> +<p>Finally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality, are representative queries of this workload, and will imply touching a vast amount of the dataset.</p> +<h3 id="final-remarks">Final remarks</h3> +<p>This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification <a href="https://github.com/ldbc/ldbc_snb_docs">draft</a>, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.</p> + + + + + Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + <p>The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. 
This post introduces the interactive workload.</p> +<p>The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user&rsquo;s social environment and potentially access data associated with the friends of a user and their friends.</p> +<p>This is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or E. The emphasis is on presenting a rich and timely view of a constantly changing environment.</p> +<p>SNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook Linkbench but SNB Interactive goes well beyond this in richness of schema and queries.</p> +<p>The challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB interactive seeks more agility and adhoc capability also on the operational side.</p> +<p>The dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. 
This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.</p> +<p>The metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.</p> +<p>Different technologies can be used for implementing SNB Interactive. The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database APIs.</p> +<p>SNB Interactive is an example of LDBC&rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.</p> +<p>The benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. 
Specifics of availability and coverage may vary.</p> +<p>Subsequent posts will address the workload in more detail.</p> + + + + + Is SNB Like Facebooks LinkBench + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + <p>In this post, I will discuss in some detail the rationale and goals of the design of the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook&rsquo;s own graph benchmark called <a href="https://www.facebook.com/notes/facebook-engineering/linkbench-a-database-benchmark-for-the-social-graph/10151391496443920">LinkBench</a>. We think SNB is the most intricate graph database benchmark to date (it&rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference <a href="http://www.sigmod2014.org/">SIGMOD in Snowbird</a> after being used for this year&rsquo;s <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD Programming Contest</a>, which was about graph analytics.</p> +<p>SNB is intended to provide the following <strong>value</strong> to different stakeholders:</p> +<ul> +<li> +<p>For end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to <em>compare merits of different products</em> and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.</p> +</li> +<li> +<p>For vendors of graph database technology, SNB provides a <em>checklist of features</em> and performance characteristics that helps in product positioning and can serve to guide new development.</p> +</li> +<li> +<p>For researchers, both industrial and academic, the SNB dataset and workload provide <em>interesting challenges</em> in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provide a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.</p> +</li> +</ul> +<p>I should clarify that even though the data model of SNB resembles Facebook (and we&rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use; they don&rsquo;t need LDBC for that. Rather, we take social network data as a model for the much broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis are not just about finding data by value, but about learning about the <em>connection patterns</em> between data. 
The scenario of the SNB, a social network, was chosen with the following goals in mind:</p> +<ul> +<li> +<p>the benchmark scenario should be <strong>understandable</strong> to a large audience, and this audience should also understand the relevance of managing such data.</p> +</li> +<li> +<p>the scenario in the benchmark should cover the complete range of challenges <strong>relevant</strong> for graph data management, according to the benchmark scope.</p> +</li> +<li> +<p>the query challenges in it should be <strong>realistic</strong> in the sense that, though synthetic, similar data and workloads are encountered in practice.</p> +</li> +</ul> +<p>The SNB is in fact three distinct benchmarks with a common dataset, since there are <em>three different workloads</em>. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.</p> +<ul> +<li> +<p><strong>Interactive Workload.</strong> The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j&rsquo;s Cypher, SPARQL and SQL. The interactive workloads tests a system&rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. 
One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views; however, such structures need to be maintained with regard to the continuous inserts that are also part of the workload. This workload is now in draft stage, which means that the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a> and <a href="https://github.com/ldbc/ldbc_driver">driver software stack</a> are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and evolve these.</p> +</li> +<li> +<p><strong>Business Intelligence Workload.</strong> There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.</p> +</li> +<li> +<p><strong>Graph Analytics Workload.</strong> This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph. 
The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.</p> +</li> +</ul> +<p>All the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a>. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to <a href="http://people.cs.uchicago.edu/~tga/pubs/sigmod-linkbench-2013.pdf">LinkBench</a>, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the <strong>low-level</strong> MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. Note that <a href="http://borthakur.com/ftp/sigmod2013.pdf">Facebook&rsquo;s IT infrastructure</a> does not store all user data in MySQL and its modified memcached (&quot;<a href="http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/11730-atc13-bronson.pdf">TAO</a>&quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. 
However, for queries like in the SNB Interactive and BI workloads it <strong>does</strong> matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with <em>correlations</em>, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a &ldquo;choke point&rdquo; for graph data management system where LDBC wants to stimulate innovation).</p> + + + + + Making It Interactive + https://ldbcouncil.org/post/making-it-interactive/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-it-interactive/ + <p><em>Synopsis:</em> Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.</p> +<p>It is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.</p> +<p>As part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. 
These touch, depending on the scale, upwards of a million entities in the database.</p> +<p>Now, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.</p> +<p>So, LDBC SNB has a twofold task:</p> +<ol> +<li>In order to be a credible interactive workload, it must in fact have characteristics of one.</li> +<li>In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics.</li> +</ol> +<p>Designing a workload presents specific challenges:</p> +<ol> +<li>The workload must be realistic enough for users to identify with it.</li> +<li>The workload must pose challenges and drive innovation in a useful direction.</li> +<li>The component operations must all play a noticeable role in it. If the operation&rsquo;s relative performance does not affect the score, why is it in the workload?</li> +</ol> +<p>The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.</p> +<p>Very roughly, the choke points (technical challenges) of SNB interactive are as follows:</p> +<ul> +<li>Random access - Traversing between people, content makes large numbers of random lookups. These can be variously parallelized and/or vectored.</li> +<li>Query optimization must produce the right plans - The primary point is join order and join type. Index vs. 
hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join.</li> +<li>When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations.</li> +<li>Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities.</li> +</ul> +<p>Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is do-able and plausible in the scenario.</p> +<p>In online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.</p> +<p>A key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.</p> +<p>The other aspect is the metric, typically some variation on operations per unit of time.</p> +<p>All these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio. 
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.</p> +<p>So how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.</p> +<p>But SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.</p> +<p>Having this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to meet the following objectives:</p> +<ul> +<li>Short queries should collectively be about 45% of the CPU load.</li> +<li>Updates will be under 5%.</li> +<li>Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others.</li> +</ul> +<p>The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.</p> +<p>There is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.</p> +<p>In the next post we will look at the actual mix and execution times on the test system.</p> + + + + + SNB Data Generator - Getting Started + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">this</a> and <a href="https://ldbcouncil.org/post/getting-started-with-snb">this</a>) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. 
In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.</p> +<h3 id="getting-and-configuring-hadoop">Getting and Configuring Hadoop</h3> +<p>DATAGEN runs on top of hadoop 1.2.1 to be scalable. You can download it from here. Open a console and type the following commands to decompress hadoop into the /home/user folder:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz hadoop-1.2.1.tar.gz +</span></span></code></pre></div><p>For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done for configuring it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster of machines), visit the <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/wiki/Configuration">LDBC DATAGEN wiki</a>.</p> +<h3 id="getting-and-configuring-datagen">Getting and configuring DATAGEN</h3> +<p>Before downloading DATAGEN, be sure to fulfill the following requirements:</p> +<ul> +<li>Linux based machine</li> +<li>java 1.6 or greater</li> +<li>python 2.7.X</li> +<li>maven 3</li> +</ul> +<p>After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/releases">here</a>. 
Again, decompress the downloaded file with the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz +</span></span></code></pre></div><p>This will create a folder called “ldbc_snb_datagen-0.1.2”.</p> +<p>DATAGEN provides <em>run.sh</em>, a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>HADOOP_HOME<span style="color:#f92672">=</span>/home/user/hadoop-1.2.1 +</span></span><span style="display:flex;"><span>LDBC_SNB_DATAGEN_HOME<span style="color:#f92672">=</span>/home/user/ldbc_snb_datagen +</span></span></code></pre></div><p>HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute the <em>run.sh</em> script to compile and execute DATAGEN using default parameters. 
Type the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user/ldbc_snb_datagen-0.1.2 +</span></span><span style="display:flex;"><span>$ ./run.sh +</span></span></code></pre></div><p>This will run DATAGEN, and two folders will be created in the same directory: <em>social_network</em> containing the scale factor 1 dataset with uncompressed csv files, and <em>substitution_parameters</em> containing the substitution parameters needed by the driver to execute the benchmark.</p> +<h3 id="changing-the-generated-dataset">Changing the generated dataset</h3> +<p>The characteristics of the dataset to be generated are specified in the <em>params.ini</em> file. By default, this file has the following content:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:1</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:1</span> +</span></span></code></pre></div><p>The following is the list of options and their default values supported by DATAGEN:</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>scaleFactor</td> +<td>1</td> +<td>&ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000&rdquo;</td> +</tr> +<tr> +<td>serializer</td> +<td>csv</td> +<td>&ldquo;The format of the output data. 
Options are: csv, csv_merge_foreign, ttl&rdquo;</td> +</tr> +<tr> +<td>compressed</td> +<td>FALSE</td> +<td>Specifies to compress the output data in gzip.</td> +</tr> +<tr> +<td>outputDir</td> +<td>./</td> +<td>Specifies the folder to output the data.</td> +</tr> +<tr> +<td>updateStreams</td> +<td>FALSE</td> +<td>&ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static&rdquo;</td> +</tr> +<tr> +<td>numThreads</td> +<td>1</td> +<td>Sets the number of threads to use. Only works for pseudo-distributed mode</td> +</tr> +</tbody> +</table> +<p>For instance, a possible <em>params.ini</em> file could be the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:30</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:ttl</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For those not interested on generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>numPersons</td> +<td>-</td> +<td>The number of persons to generate</td> +</tr> +<tr> +<td>numYears</td> +<td>-</td> +<td>The amount of years of activity</td> 
+</tr> +<tr> +<td>startYear</td> +<td>-</td> +<td>The start year of simulation.</td> +</tr> +</tbody> +</table> +<p>The following is an example of another possible <em>params.ini</em> file</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">numPersons:100000</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numYears:3</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">startYear:2010</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv_merge_foreign</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/">GitHub</a>!</p> + + + + + The Day of Graph Analytics + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + <p><em>Note: consider this post as a continuation of the &ldquo;<a href="https://ldbcouncil.org/post/making-it-interactive">Making it interactive</a>&rdquo; post by Orri Erling.</em></p> +<p>I have now completed the <a href="https://github.com/openlink/virtuoso-opensource">Virtuoso</a> TPC-H work, including scale out. 
Optimization possibilities extend to infinity but the present level is good enough. <a href="http://www.tpc.org/tpch/">TPC-H</a> is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.</p> +<p>So, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.</p> +<p>The BI part is like TPC-H, except for adding the following challenges:</p> +<ul> +<li> +<p>Joins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.</p> +</li> +<li> +<p>Transitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.</p> +</li> +<li> +<p>Transitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.</p> +</li> +<li> +<p>Graph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. 
This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.</p> +</li> +<li> +<p>Running one query with parameters from different buckets, implying different best plan.</p> +</li> +<li> +<p>Data correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.</p> +</li> +<li> +<p>Large intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.</p> +</li> +<li> +<p>More unions and outer joins.</p> +</li> +</ul> +<p>The idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.</p> +<p>For rules and metric, we can use the TPC-H or <a href="http://www.tpc.org/tpcds/default.asp">TPC-DS</a> ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and <a href="http://www.openstreetmap.org/">Open Street Map</a>. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.</p> +<p>Doing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.</p> +<p>As a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. 
Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or &ldquo;Platonic&rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.</p> +<p>So, the transitive derived table construct can have pluggable aggregations (e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.</p> +<p>The distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. forward and backward in betweenness centrality.</p> +<p>This brings us to the graph analytics proper, which is often done in BSP style, e.g. <a href="http://es.slideshare.net/shatteredNirvana/pregel-a-system-for-largescale-graph-processing">Pregel</a>, <a href="http://giraph.apache.org">Giraph</a>, <a href="http://uzh.github.io/signal-collect/">Signal-Collect</a>, some but not all <a href="http://ppl.stanford.edu/main/green_marl.html">Green-Marl</a> applications. 
In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.</p> +<p>With BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as <a href="http://graphbench.org/">graphbench.org</a> but will be adapted to the SNB dataset.</p> +<p>The analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. 
The interested may pitch in as the matter comes up.</p> + + + + + Using LDBC SPB to Find OWLIM Performance Issues + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + Wed, 20 Aug 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + <p>During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (LDBC-SPB) as a part of our development and release process.</p> +<p>First thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.</p> +<p>Initially we’ve decided to fix some of the benchmark parameters :</p> +<ul> +<li>the dataset size - 50 million triples (LDBC-SPB50)</li> +<li>benchmark warmup and benchmark run times - 60s and 600s respectively.</li> +<li>maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations)</li> +<li>maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations)</li> +<li>generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run.</li> +</ul> +<p>Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.</p> +<p>We’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. 
Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms of editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.</p> +<p>Following two charts are showing how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on a bare-metal hardware.</p> +<p><img src="16-2-Performance.png" alt="image"></p> +<p>Figure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.</p> +<p><img src="8-0-Performance.png" alt="image"></p> +<p>Figure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation agents running simultaneously. Read-only mode.</p> +<p>Another thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark&rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners can not be compared to backward-chaining ones which is not the goal of the benchmark. 
Loading performance is not measured “officially” by LDBC-SPB (although time for loading the data is reported), but it is a good thing to have when comparing RDF Stores.</p> +<p>An additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called <em>checkConformance</em>. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. The result of that phase is a list of all rules that have been passed or failed which is very useful for regression testing.</p> + + + + + Fourth TUC meeting + https://ldbcouncil.org/event/fourth-tuc-meeting/ + Thu, 03 Apr 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fourth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.</p> +<p>This will be a one-day event at CWI in Amsterdam on <em>Thursday April 3, 2014</em>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<p><strong>For presenters please limit your talks to just 15 minutes</strong></p> +<h3 id="agenda">Agenda</h3> +<p><strong>April 3rd</strong></p> +<ul> +<li> +<p>10:00 Peter Boncz (VUA) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506371.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=JYWVgrP1kVY">video</a>: <em>LDBC project status update</em></p> +</li> +<li> +<p>10:20 Norbert Martinez (UPC) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506375.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=4yREJQ3yDr0">video</a>: <em>Status update on the LDBC Social Network Benchmark (SNB) task force</em>.</p> +</li> +<li> +<p>10:50 Alexandru Iosup (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506363.ppt">ppt</a>, <a href="https://www.youtube.com/watch?v=ulT-RFwKpOE">video</a>: <em>Towards Benchmarking Graph-Processing Platforms</em></p> +</li> +<li> +<p>11:10 Mike Bryant (Kings College) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506364.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=KiHRTu9xx0A">video</a>: <em>EHRI Project: Archival Integration with Neo4j</em></p> +</li> +</ul> +<p><strong>11:30 coffee</strong></p> +<ul> +<li> +<p>11:50 Thilo Muth (University of Magdeburg) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506369.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=5xH3UDLP6Oc">video</a>: <em>MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis</em></p> +</li> +<li> +<p>12:10 Davy Suvee (Janssen Pharmaceutica / Johnson &amp; Johnson) – <a 
href="https://www.youtube.com/watch?v=XN3LRJUfJIU">video</a>: <em>Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph</em></p> +</li> +<li> +<p>12:30 Yongming Luo (TU Eindhoven) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506366.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=g_my3tBB2_s">video</a>: <em>Regularities and dynamics in bisimulation reductions of big graphs</em></p> +</li> +<li> +<p>12:50 Christopher Davis (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506370.pdf">pdf</a>, <a href="https://www.youtube.com/channel/UC6HbzfJ4016Vez-2HKNeDag">video</a>: <em>Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues</em></p> +</li> +</ul> +<p><strong>13:10 - 14:30 lunch @ restaurant Polder</strong></p> +<ul> +<li> +<p>14:30 <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506365.pptx">SPB task force report</a></p> +</li> +<li> +<p>15:00 Bastiaan Bijl (Sysunite) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506373.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=TsCeKDHShMY">video</a>: <em>Using a semantic approach for monitoring applications in large engineering projects</em></p> +</li> +<li> +<p>15:20 Frans Knibbe (Geodan) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506372.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=uAX-m4OewPM">video</a>: <em>Benchmarks for geographical data</em></p> +</li> +<li> +<p>15:40 Armando Stellato (University of Rome, Tor Vergata &amp; UN Food and Agriculture Organization) – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506374.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=mfA4csAs72Y">video</a>: <em>VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges</em></p> +</li> +</ul> +<p><strong>16:00 coffee</strong></p> +<ul> +<li> +<p>16:20 Ralph Hodgson (TopQuadrant) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=ZUDnVw9P_Rc">video</a>: <em>Customer experiences in implementing SKOS-based vocabulary management systems</em></p> +</li> +<li> +<p>16:40 Simon Jupp (European Bioinformatics Institute) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506368.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=CgTuOGK92W8">video</a>: <em>[Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.]</em></p> +</li> +<li> +<p>17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506381.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=QTc3yOgoEsg">video</a>: <em>Breakmarking UniProt RDF. 
SPARQL queries that make your database cry&hellip;</em></p> +</li> +<li> +<p>17:20 Rein van &rsquo;t Veer (Digital Heritage Netherlands) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506380.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=2vDrZoskGyQ">video</a> <em>Time and space for heritage</em></p> +</li> +<li> +<p>17:40 <strong>end of meeting</strong></p> +</li> +<li> +<p>19:00 - 21:30 Social Dinner in restaurant Boom</p> +</li> +</ul> +<p><strong>April 4th</strong></p> +<p>LDBC plenary meeting for project partners.</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506362.ppt">Benchmarking Graph-Processing Platforms: A Vision</a> – Alexandru Iosup</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p>The meeting will be held at the Dutch national research institute for computer science and mathematics (<a href="http://www.cwi.nl">CWI</a> - Centrum voor Wiskunde en Informatica). 
It is located at <a href="http://www.amsterdamsciencepark.nl/">Amsterdam Science Park</a>:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505821.jpg" alt=""></p> +<p>(<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505820.pdf">A5 map</a>)</p> +<h6 id="travel">Travel</h6> +<p><strong>Arriving &amp; departing:</strong></p> +<p>Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, <a href="http://www.schiphol.com/">www.schiphol.nl</a>) that serves all main European carriers and also very many low-fare carriers.</p> +<p><a href="http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane">http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane</a></p> +<p><strong>Trains</strong> (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) &ndash; which station you are also likely arriving at in case of an international train trip.</p> +<p>From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 &ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).</p> +<p><strong>Taxi</strong> is an alternative, though expensive. 
The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).</p> +<p><strong>Public transportation</strong> (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.</p> +<p><strong>Only the &ldquo;disposable&rdquo; cards are interesting for you as visitor.</strong></p> +<p>Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.</p> +<p><strong>Getting Around:</strong> the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.</p> +<p><strong>Cars</strong></p> +<p>In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the &ldquo;WCW&rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.</p> +<p><strong>Arriving at CWI:</strong> Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. 
Then, you&rsquo;ll receive a visitor&rsquo;s pass that allows you to enter our building.</p> +<p><strong>Social Dinner</strong></p> +<p>The social dinner will take place at 7pm on April 3 in Restaurant Boom (<a href="http://www.boometenendrinken.nl/">boometenendrinken.nl</a>), Linnaeusstraat 63, Amsterdam.</p> + + + + + Third TUC Meeting + https://ldbcouncil.org/event/third-tuc-meeting/ + Tue, 19 Nov 2013 08:00:00 +0000 + + https://ldbcouncil.org/event/third-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!</p> +<p>This will be a one day event in London on the <strong>19 November 2013</strong> running in collaboration with the <a href="http://www.graphconnect.com/london/">GraphConnect</a> event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using the following coupon code: <strong>LDBCTUC</strong>.</p> +<p>The TUC event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology</li> +<li>Industry discussions on the contents of the benchmarks</li> +</ul> +<p>We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.</p> +<p>We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a></li> +<li><a href="#ldbctuc-background">LDBC/TUC Background</a> +<ul> +<li><a href="#social-network-benchmark">Social Network Benchmark</a></li> +<li><a href="#semantic-publishing-benchmark">Semantic Publishing Benchmark</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>November 19th - Public TUC Meeting</strong></p> +<p>8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)</p> +<p>short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)</p> +<p>NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.</p> +<p>10:00 TUC Meeting Opening (Peter Boncz)</p> +<p>10:10 TUC Presentations (RDF Application Descriptions)</p> +<ul> +<li>Johan Hjerling (BBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275669.pdf">BBC Linked Data and the Semantic Publishing Benchmark</a></strong></em></li> +<li>Andreas Both (Unister): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505027.pdf">Ontology-driven applications in an e-commerce context</a></strong></em></li> +<li>Nuno Carvalho (Fujitsu Laboratories Europe): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275666.pdf"><em><strong>Fujitsu RDF use cases and benchmarking requirements</strong></em></a></li> +<li>Robina Clayphan (Europeana): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816977.ppt">Europeana and Open Data</a></strong></em></li> +</ul> +<p>11:30 Semantic Publishing Benchmark (SPB)</p> +<ul> +<li>Venelin Kotsev (Ontotext - LDBC): <em><strong><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">Semantic Publishing Benchmark Task Force Update</a></strong></em> and <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">report</a></strong></em></li> +</ul> +<p>12:00-13:00 Lunch at the Graph Connect venue</p> +<p><em>Talks During Lunch:</em></p> +<ul> +<li>Pedro Furtado, Jorge Bernardino (Univ. Coimbra): <strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275671.pdf">KEYSTONE Cost Action</a></strong></li> +</ul> +<p>13:00 TUC Presentations (Graph Application Descriptions)</p> +<ul> +<li>Minqi Zhou / Weining Qian (East China Normal University): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275670.pdf">Elastic and realistic social media data generation</a></strong></em></li> +<li>Andrew Sherlock (Shapespace): <em><strong>Shapespace Use Case</strong></em></li> +<li>Sebastian Verheughe (Telenor): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275667.pdf">Real-time Resource Authorization</a></strong></em></li> +</ul> +<p>14:00 Social Network Benchmark (SNB)</p> +<ul> +<li>Norbert Martinez (UPC - LDBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505025.pdf">Social Network Benchmark Task Force Update</a></strong></em> and <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">Report</a></li> +</ul> +<p><em>14:30 Break</em></p> +<p>14:45 TUC Presentations (Graph Analytics)</p> +<ul> +<li>Keith Houck (IBM): <em><strong>Benchmarking experiences with [System G Native Store (tentative title)]</strong></em></li> +<li>Abraham Bernstein (University of 
Zurich): <em><strong>Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store</strong></em></li> +<li>Luis Ceze (University of Washington): <em><strong>Grappa and GraphBench Status Update</strong></em></li> +</ul> +<p><em>15:45 Break</em></p> +<p>16:00 TUC Presentations* (Possible Future RDF Benchmarking Topics)*</p> +<ul> +<li>Christian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): <em><strong>CIDOC-CRM</strong></em></li> +<li>Atanas Kiryakov (Ontotext): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275672.pdf">Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM)</a></strong></em></li> +<li>Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275668.pdf">Geographica: A Benchmark for Geospatial RDF Stores</a></strong></em></li> +<li>Xavier Lopez (Oracle): <em><strong>W3C Property Graph progress</strong></em></li> +<li>Thomas Scharrenbach (University Zurich) <em><strong>PCKS: Benchmarking Semantic Flow Processing Systems</strong></em></li> +</ul> +<p>17:20 Meeting Conclusion (Josep Larriba Pey)</p> +<p>17:30 End of TUC meeting</p> +<p>19:00 Social dinner</p> +<p><strong>November 20th - Internal LDBC Meeting</strong></p> +<p>10:00 Start</p> +<p>12:30 <em>End of meeting</em></p> +<ul> +<li>coffee and lunch provided</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p><strong>Date</strong></p> +<p>19th November 2013</p> +<p><strong>Location</strong></p> +<p>The TUC meeting will be held in <strong>The Tower</strong> hotel (<a href="http://goo.gl/qZt8Fz">Google Maps link</a>) approximately 4 minutes walk from the <a href="http://www.graphconnect.com/london/">GraphConnect</a> conference in London.</p> +<p>Getting there</p> +<ul> +<li>From City Airport is the easiest: short ride 
on the DLR to Tower Gateway. Easy.</li> +<li>From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554995.pdf">See attached</a>.</li> +</ul> +<h3 id="ldbctuc-background">LDBC/TUC Background</h3> +<p>Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SNB_Report_Nov2013.pdf</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SPB_Report_Nov2013.pdf</a></li> +</ul> +<p>A summary of these efforts can be read below or, for a more detailed account, please refer to: <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554967.pdf">The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort</a>. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.</p> +<h4 id="social-network-benchmark">Social Network Benchmark</h4> +<p>The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. 
The social network scenario was chosen with the following goals in mind:</p> +<ul> +<li>it should be understandable, and the relevance of managing such data should be understandable</li> +<li>it should cover the complete range of interesting challenges, according to the benchmark scope</li> +<li>the queries should be realistic, i.e., similar data and workloads are encountered in practice</li> +</ul> +<p>SNB includes a data generator for creation of synthetic social network data with the following characteristics:</p> +<ul> +<li>data schema is representative of real social networks</li> +<li>data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions</li> +<li>the software generator is easy-to-use, configurable and scalable</li> +</ul> +<p>SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:</p> +<ul> +<li><strong>Interactive</strong> +<ul> +<li>Tests system throughput with relatively simple queries and concurrent updates, it is designed to test ACID features and scalability in an online operational setting.</li> +<li>The targeted systems are expected to be those that offer transactional functionality.</li> +</ul> +</li> +<li><strong>Business Intelligence</strong> +<ul> +<li>Consists of complex structured queries for analyzing online behavior of users for marketing purposes, it is designed to stress query execution and optimization.</li> +<li>The targeted systems are expected to be those that offer an abstract query language.</li> +</ul> +</li> +<li><strong>Graph Analytics</strong> +<ul> +<li>Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.</li> +<li>Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need 
isolation.</li> +<li>The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.</li> +</ul> +</li> +</ul> +<h4 id="semantic-publishing-benchmark">Semantic Publishing Benchmark</h4> +<p>The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.</p> +<p>The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works &ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.</p> +<p>The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. 
Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.</p> +<p>Two separate workloads are modeled in SPB:</p> +<ul> +<li><strong>Editorial:</strong> Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.</li> +<li><strong>Aggregation:</strong> Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as &ldquo;dynamic&rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.</li> +</ul> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505026.pdf">Status of the Semantic Publishing Benchmark</a></p> + + + + + Second TUC Meeting + https://ldbcouncil.org/event/second-tuc-meeting/ + Mon, 22 Apr 2013 10:00:00 +0000 + + https://ldbcouncil.org/event/second-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.</p> +<p>This will be a two day event in Munich on the <strong>22/23rd April 2013</strong>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +<li><a href="#venue">Venue</a> +<ul> +<li><a href="#getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</a></li> +<li><a href="#getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</a></li> +<li><a href="#getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</a></li> +</ul> +</li> +<li><a href="#getting-there">Getting there</a></li> +<li><a href="#social-dinner">Social Dinner</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>April 22nd</strong></p> +<p>10:00 <em>Registration.</em><br> +10:30 Josep Lluis Larriba Pey (UPC) - <em>Welcome and Introduction.</em><br> +10:30 Peter Boncz (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687373.pptx">LDBC: goals and status</a></p> +<p><em>Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)</em></p> +<p>11:00 Josep Lluis Larriba Pey (UPC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687372.pdf">Social Network Benchmark Task Force</a><br> +11:30 Gustavo González (Mediapro): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687367.pdf">Graph-based User Modeling through Real-time Social Streams</a><br> +12:00 Klaus Großmann (Dshini): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687365.pdf">Neo4j at Dshini</a></p> +<p>12:30 Lunch</p> +<p><em>Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)</em></p> 
+<p>13:30 Barry Bishop (Ontotext): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687366.pptx">Semantic Publishing Benchmark Task Force</a><br> +14:00 Dave Rogers (BBC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687364.pptx">Linked Data Platform at the BBC</a><br> +14:30 Edward Thomas (Wolters Kluwer): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687374.pdf">Semantic Publishing at Wolters Kluwer</a></p> +<p>15:00 Coffee break</p> +<p><em>Projects Related to LDBC</em></p> +<p>15:30 Fabian Suchanek (MPI): &ldquo;YAGO: A large knowledge base from Wikipedia and WordNet&rdquo;<br> +16:00 Antonis Loziou (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687375.pptx">The OpenPHACTS approach to data integration</a><br> +16:30 Mirko Kämpf (Brox): &ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case&rdquo;</p> +<p>17:00 <em>End of first day</em></p> +<p>19:00 Social dinner</p> +<p><strong>April 23rd</strong></p> +<p><em>Industry &amp; Hardware Aspects</em></p> +<p>10:00 Xavier Lopez (Oracle): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687384.pdf">Graph Database Performance an Oracle Perspective.pdf</a><br> +10:30 Pedro Trancoso (University of Cyprus): &ldquo;Benchmarking and computer architecture: the research side&rdquo;</p> +<p>11:00 Coffee break</p> +<p><em>Future Steps and TUC feedback session</em></p> +<p>11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force<br> +12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force&quot;</p> +<p>12:30 <em>End of meeting</em></p> +<h3 id="logistics">Logistics</h3> +<h4 id="date">Date</h4> +<p>22nd and 23th April 2013</p> +<h4 
id="location">Location</h4> +<p>The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:</p> +<p>LRZ (Leibniz-Rechenzentrum)<br> +Boltzmannstraße 1<br> +85748 Garching, Germany</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi and Subway <a href="http://www.in.tum.de/fileadmin/user_upload/Sonstiges/anfahrt_garching.pdf">Ubahn</a></p> +<h5 id="getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</h5> +<p>Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.</p> +<h5 id="getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</h5> +<ol> +<li> +<p>(except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.</p> +</li> +<li> +<p>S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.</p> +</li> +<li> +<p>Taxi: fare is ca. 30-40 euros.</p> +</li> +</ol> +<p>For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. 
It will cover all public transportation for that day.</p> +<h5 id="getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</h5> +<p>The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.</p> +<p><strong>Finding LRZ@TUM</strong></p> +<p><a href="http://www.openstreetmap.org/?mlat=48.2615702464&amp;mlon=11.6686558264&amp;zoom=32">OpenStreetMap link</a></p> +<p><a href="https://maps.google.com/maps?q=48.2615702464,11.6686558264&amp;spn=0.005,0.005&amp;t=k">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687268.gif" alt=""></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687269.gif" alt=""></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying: Munich</strong> airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.</p> +<p><strong>S-Bahn:</strong> S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the tickets needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.</p> +<p><strong>Taxi:</strong> taxi from the airport to the city center costs approximately 50 euros</p> +<h4 id="social-dinner">Social Dinner</h4> +<p>The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)</p> +<p>Address: Hofbräuhaus, Platzl 9, Munich</p> + + + + + First TUC Meeting + https://ldbcouncil.org/event/first-tuc-meeting/ + Mon, 19 Nov 2012 09:00:00 +0100 + + https://ldbcouncil.org/event/first-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the <strong>19/20th November 2012</strong>.</p> +<p>So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event wil include:</p> +<ul> +<li>Introduction by the coordinator and technical director explaining the objectives of the LDBC project</li> +<li>Invitation to users to explain their use-cases and describe the limitations they have found in current technology</li> +<li>Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points</li> +</ul> +<p>The exact agenda will be published here as things get finalised before the event.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#slide">Slide</a> +<ul> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +</ul> +</li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>We will start at 9:00 on Monday for a full day, followed by a half a day on Tuesday to allow attendees to travel home on the evening of the 20th.</p> +<p><strong>Day 1</strong></p> +<p>09:00 Welcome (Location: Aula Master)<br> +09:30 Project overview (Emphasis on task forces?) + Questionnaire results?<br> +10:30 Coffee break<br> +11:00 User talks (To gather information for use cases?)</p> +<p>13:00 Lunch</p> +<p>14:00 User talks (cont.)<br> +15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).<br> +16:00 Task force proposals (consortium)<br> +17:00 Finish first day</p> +<p>20:00 Social dinner</p> +<p><strong>Day 2</strong></p> +<p>10:00 Task force discussion (consortium + TUC)<br> +11:00 Coffe break<br> +11:30 Task force discussion (consortium + TUC)<br> +12:30 Summaries (Task forces, use cases, &hellip;) and actions</p> +<p>13:00 Lunch and farewell</p> +<p>15:00 LDBC Internal meeting</p> +<h3 id="slide">Slide</h3> +<p>Opening session:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686995.pptx">CWI – Peter Boncz</a> – Objectives</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687001.pdf">UPC – Larri</a> – Questionnaire</li> +</ul> +<p>User stories:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686998.pdf">BBC – Jem Rayfield</a></li> +<li>CA Technologies – Victor Muntés</li> 
+<li>Connected Discovery (Open Phacts) – Bryn Williams-Jones</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687003.pptx">Elsevier – Alan Yagoda</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687000.pptx">ERA7 Bioinformatics – Eduardo Pareja</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687005.pptx">Press Association – Jarred McGinnis</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687004.pptx">RJLee – David Neuer</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686994.pdf">Yale – Lec Maj</a></li> +</ul> +<p>Benchmark proposals:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686991.pdf">Publishing benchmark proposal – Ontotext – Barry Bishop</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687002.pdf">Social Network Benchmark Proposal – UPC – Larri</a></li> +</ul> +<h4 id="logistics">Logistics</h4> +<h5 id="date">Date</h5> +<p>19th and 20th November 2012</p> +<h5 id="location">Location</h5> +<p>The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<p><strong>Finding UPC</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<p><strong>Finding the meeting room</strong></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<p><strong>The locations of the airport and the city centre</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933317.jpg" alt=""></p> +<p><strong>Bus map</strong></p> + + + + + \ No newline at end of file diff --git a/event/ninth-tuc-meeting/index.html b/event/ninth-tuc-meeting/index.html new file mode 100644 index 00000000..df12e3e5 --- /dev/null +++ b/event/ninth-tuc-meeting/index.html @@ -0,0 +1,696 @@ + + + + + Ninth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Ninth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Marcus Paradies / on 14 Feb 2017
+ + + +
+ Location: SAP Headquarters in Walldorf Germany +
+
+ + Event dates: 09 Feb 2017 15:07 -- 10 Feb 2017 15:07 (local timezone) + +
+
+

LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.

+

This will be a two-day event at SAP Headquarters in Walldorf, Germany on February 9+10, 2017.

+

This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking experiences.
  • +
  • Presentation of the benchmarking results for the different benchmarks.
  • +
  • Interaction with the new LDBC Board of Directors and the LDBC organisation officials.
  • +
+

We welcome all users of RDF and Graph technologies to attend. If you are interested, please contact Damaris Coll (UPC) at damaris@ac.upc.edu.

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.

+

Further, if you or your colleagues have contacts at companies that deal with graph data management scenarios, we ask you to invite them to attend and possibly present as well. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts, and eventually its membership base.

+

Agenda

+

In the TUC meeting there will be

+
    +
  • updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.
  • +
  • talks by data management practitioners highlighting graph data management challenges
  • +
  • selected scientific talks on graph data management technology
  • +
+

The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.

+

Thursday evening (19:00-21:00) there will be a social dinner in Heidelberg.

+

Friday morning the event resumes from 9:00-12:00. In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.

+

Social Dinner

+

+

Address: Hauptstraße 217, 69117 Heidelberg
+Time: 19:00 / 7pm

+

(See attachments at the bottom of the page)

+
Thursday
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
start timetitle – speaker
9:00Welcome and logistics - Marcus Paradies (SAP)
9:10Intro + state of the LDBC - Josep Lluis Larriba Pey (UPC)
9:20LDBC Graph QL task force - Hannes Voigt (TU Dresden)
9:40PGQL Status Update and Comparison to LDBC’s Graph QL proposals - Oskar van Rest (Oracle Labs)
10:00Adding shortest-paths to MonetDB - Dean de Leo (CWI)
10:20coffee
10:50Evolving Cypher for processing multiple graphs - Stefan Plantikow (Neo Technology)
11:10Standardizing Graph Database Functionality - An Invitation to Collaborate - Jan Michels (ISO/ANSI SQL, Oracle)
11:30Dgraph: Graph database for production environment - Tomasz Zdybal (Dgraph.io)
12:00lunch
13:00LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap - Alexandru Iosup (TU Delft)
13:20LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)
13:40LDBC SNB Datagen Update - Arnau Prat (UPC)
14:00LDBC SNB Business Intelligence Workload: Chokepoint Analysis - Arnau Prat (UPC)
14:20LDBC Benchmark Cost Specification (+discussion) - Moritz Kaufmann (TU Munich)
14:40coffee break
15:10EYWA: the Distributed Graph Engine in Huawei MIND Platform (Yinglong Xia)
15:30Graph Processing in SAP HANA - Marcus Paradies (SAP)
15:50Distributed Graph Analytics with Gradoop - Martin Junghanns (Univ Leipzig)
16:10Distributed graph flows: Cypher on Flink and Gradoop - Max Kießling (Neo Technology)
16:30closing - Peter Boncz
17:30end
+
Friday
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
start timetitle – speaker
9:00welcome - Peter Boncz
9:20Graph processing in obi4wan - Frank Smit (OBI4WAN)
9:40Graph problems in the space domain - Albrecht Schmidt (ESA)
10:00Medical Ontologies for Healthcare - Michael Neumann (SAP)
10:20coffee
10:50The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries - Gabor Szarnyas (BME)
11:10Efficient sparse matrix computations and their generalization to graph computing applications - Albert-Jan Yzelman (Huawei)
11:30Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge - Atanas Kyriakov (Ontotext)
12:00lunch
13:00LDBC Board of Directors Meeting
17:00end
+

Logistics

+
Important things to know
+

The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: link

+
Venue
+

The TUC meeting will be held in the SAP Headquarters at the SAP Guesthouse Kalipeh (https://www.kalipeh.com). The address is:

+

WDF 44 / SAP Guesthouse Kalipeh
+Dietmar-Hopp-Allee 15
+69190 Walldorf
+Germany

+
Maps and situation
+

Google Maps link

+

+

Getting there

+
By plane
+

There are two airports close to SAP’s headquarters: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.

+

When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt Hahn is approximately one hour from the Frankfurt main airport by car.

+

The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).

+

Journey time from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).

+
Driving directions
+

Traveling from Frankfurt Airport (FRA) to SAP Headquarters:

+

Directions to SAP headquarters:

+
    +
  • When leaving the airport, follow the highway symbol onto “A3/Würzburg/A5/Kassel/Basel/Frankfurt.”
  • +
  • Follow the A5 to “Basel/Karlsruhe/Heidelberg.”
  • +
  • Take exit 39 – “Walldorf/Wiesloch.”
  • +
  • Turn left onto B291.
  • +
  • Turn right onto Dietmar-Hopp-Allee.
  • +
+

(Should you use a navigational system which does not recognize the street name ‘Dietmar-Hopp-Allee’ please use ‘Neurottstrasse’ instead.)

+

Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:

+

To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. The route via Karlsruhe is a bit shorter yet may be more congested.

+

Directions to SAP headquarters:

+
    +
  • When leaving the airport, follow the highway symbol onto “A8/Stuttgart/B27.”
  • +
  • Stay on A8 and follow the sign for “Karlsruhe/Heilbronn/Singen/A8.”
  • +
  • Follow A8 to Karlsruhe.
  • +
  • Take exit 41 – “Dreieck Karlsruhe” to merge onto A5 toward “Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).”
  • +
  • Take exit 39 – “Walldorf/Wiesloch.”
  • +
  • Turn left onto B291.
  • +
  • Turn right onto Dietmar-Hopp-Allee.
  • +
+
Parking
+

The closest parking lot to the event location is P7 (see figure above).

+
By Train
+

As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.

+

From Frankfurt Airport (FRA) to SAP Headquarters

+

Directions to SAP headquarters:

+
    +
  • Go to Terminal 1, level T (see overview in Appendix).
  • +
  • Go to the AIRail Terminal – “Fernbahnhof” (long-distance trains).
  • +
  • Choose a connection with the destination train station “Wiesloch–Walldorf”.
  • +
  • From station “Wiesloch–Walldorf,” take bus number 707 or 721 toward “Industriegebiet Walldorf, SAP.” It is a 10-minute ride to reach bus stop ‘SAP headquarters’.
  • +
+

From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters

+

Directions to SAP headquarters:

+
    +
  • Go to the S-Bahn station in the airport, following the sign (station is called “Stuttgart Flughafen/Messe”).
  • +
  • Take train number S2 or S3 to “Stuttgart Hauptbahnhof” (main station).
  • +
  • From Stuttgart Hauptbahnhof choose a connection with the destination train station “Wiesloch–Walldorf”.
  • +
  • From station “Wiesloch–Walldorf,” take bus number 707 or 721 toward “Industriegebiet Walldorf, SAP”. It is a 10-minute ride to reach bus stop ‘SAP headquarters’.
  • +
+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/page/1/index.html b/event/page/1/index.html new file mode 100644 index 00000000..7fa970fd --- /dev/null +++ b/event/page/1/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/event/ + + + + + + diff --git a/event/page/2/index.html b/event/page/2/index.html new file mode 100644 index 00000000..27a84405 --- /dev/null +++ b/event/page/2/index.html @@ -0,0 +1,671 @@ + + + + + Events + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Events

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Tenth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

This will be a one-day event at the VLDB 2017 conference in Munich, Germany on September 1, 2017.

+

Topics and activities of interest in these TUC meetings are:

+
    +
  • Presentation on graph data management usage scenarios.
  • +
  • Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.
  • +
  • Interaction with the new LDBC Board of Directors and the LDBC organisation officials.
  • +
+

We welcome all users of RDF …

+ +
+
+ +
+ + +
+
+
+ +

Ninth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.

+

This will be a two-day event at SAP Headquarters in Walldorf, Germany on February 9+10, 2017.

+

This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking experiences.
  • +
  • Presentation of the …
+ +
+
+ +
+ + +
+
+
+ +

Eighth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.

+

This will be a two-day event at Oracle Conference Center in Redwood Shores facility on Wednesday and Thursday June 22-23, 2016.

+

This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day …
+ +
+
+ +
+ + +
+
+
+ +

Seventh TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.

+

This will be a two-day event at IBM’s TJ Watson facility on Monday and Tuesday November 9/10, 2015.

+

This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking …
+ +
+
+ +
+ + +
+
+
+ +

Sixth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium is pleased to announce its Sixth Technical User Community (TUC) meeting.

+

This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on Thursday and Friday March 19/20, 2015.

+

The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will …

+ +
+
+ +
+ + +
+
+
+ +

Fifth TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium are pleased to announce its fifth Technical User
+Community (TUC) meeting.

+

This will be a one-day event at the National Hellenic Research Institute
+in Athens, Greece on Friday November 14, 2014.

+

Agenda

+

10:30 - 11:00 Coffee Break

+

11:00 - 11:10 Peter Boncz (VUA) Welcome & LDBC project status update (Presentation)

+

11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark: Short Presentation of SPB and Status

+

Feedback …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/page/3/index.html b/event/page/3/index.html new file mode 100644 index 00000000..f18a2dce --- /dev/null +++ b/event/page/3/index.html @@ -0,0 +1,611 @@ + + + + + Events + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Events

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Fourth TUC meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.

+

This will be a one-day event at CWI in Amsterdam on Thursday April 3, 2014.

+

The event will include:

+
    +
  • Introduction to the objectives and progress of the LDBC project.
  • +
  • Description of the progress of the benchmarks being evolved through Task Forces.
  • +
  • Users explaining their use-cases and describing the limitations they have found in current technology. …
+ +
+
+ +
+ + +
+
+
+ +

Third TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!

+

This will be a one day event in London on the 19 November 2013 running in collaboration with the GraphConnect event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using this following coupon code: LDBCTUC.

+

The TUC event will include:

+
    +
  • Introduction to the objectives and …
+ +
+
+ +
+ + +
+
+
+ +

Second TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.

+

This will be a two day event in Munich on the 22/23rd April 2013.

+

The event will include:

+
    +
  • Introduction to the objectives and progress of the LDBC project.
  • +
  • Description of the progress of the benchmarks being evolved through Task Forces.
  • +
  • Users explaining their use-cases and describing the limitations they have found in current technology.
  • +
  • Industry …
+ +
+
+ +
+ + +
+
+
+ +

First TUC Meeting

+
Tags:
+ TUC MEETING + +
+
+ +

The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the 19/20th November 2012.

+

So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event will include:

+
    +
  • Introduction by the coordinator and technical director explaining the objectives of the …
+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/second-tuc-meeting/index.html b/event/second-tuc-meeting/index.html new file mode 100644 index 00000000..ccc89d84 --- /dev/null +++ b/event/second-tuc-meeting/index.html @@ -0,0 +1,552 @@ + + + + + Second TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Second TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Josep Larriba Pey / on 25 Apr 2013
+ + + +
+ Location: Munich, Germany +
+
+ + Event dates: 22 Apr 2013 10:00 -- 23 Apr 2013 17:00 (local timezone) + +
+
+

The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.

+

This will be a two day event in Munich on the 22/23rd April 2013.

+

The event will include:

+
    +
  • Introduction to the objectives and progress of the LDBC project.
  • +
  • Description of the progress of the benchmarks being evolved through Task Forces.
  • +
  • Users explaining their use-cases and describing the limitations they have found in current technology.
  • +
  • Industry discussions on the contents of the benchmarks.
  • +
+

All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu

+ +

Agenda

+

April 22nd

+

10:00 Registration.
+10:30 Josep Lluis Larriba Pey (UPC) - Welcome and Introduction.
+10:30 Peter Boncz (VUA): LDBC: goals and status

+

Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)

+

11:00 Josep Lluis Larriba Pey (UPC): Social Network Benchmark Task Force
+11:30 Gustavo González (Mediapro): Graph-based User Modeling through Real-time Social Streams
+12:00 Klaus Großmann (Dshini): Neo4j at Dshini

+

12:30 Lunch

+

Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)

+

13:30 Barry Bishop (Ontotext): Semantic Publishing Benchmark Task Force
+14:00 Dave Rogers (BBC): Linked Data Platform at the BBC
+14:30 Edward Thomas (Wolters Kluwer): Semantic Publishing at Wolters Kluwer

+

15:00 Coffee break

+

Projects Related to LDBC

+

15:30 Fabian Suchanek (MPI): “YAGO: A large knowledge base from Wikipedia and WordNet”
+16:00 Antonis Loizou (VUA): The OpenPHACTS approach to data integration
+16:30 Mirko Kämpf (Brox): “GeoKnow - Spatial Data Web project and Supply Chain Use Case”

+

17:00 End of first day

+

19:00 Social dinner

+

April 23rd

+

Industry & Hardware Aspects

+

10:00 Xavier Lopez (Oracle): Graph Database Performance an Oracle Perspective.pdf
+10:30 Pedro Trancoso (University of Cyprus): “Benchmarking and computer architecture: the research side”

+

11:00 Coffee break

+

Future Steps and TUC feedback session

+

11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force
+12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force

+

12:30 End of meeting

+

Logistics

+

Date

+

22nd and 23rd April 2013

+

Location

+

The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:

+

LRZ (Leibniz-Rechenzentrum)
+Boltzmannstraße 1
+85748 Garching, Germany

+

Venue

+

To reach the campus, there are several options, including Taxi and Subway (U-Bahn).

+
Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)
+

Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before the ride.

+
Getting to the TUM Campus from the Munich Airport
+
    +
  1. +

    (except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.

    +
  2. +
  3. +

    S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.

    +
  4. +
  5. +

    Taxi: fare is ca. 30-40 euros.

    +
  6. +
+

For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. It will cover all public transportation for that day.

+
Getting to the TUM Campus from Garching: U-Bahn
+

The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.

+

Finding LRZ@TUM

+

OpenStreetMap link

+

Google Maps link

+

+

+

Getting there

+

Flying: Munich airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.

+

S-Bahn: S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the ticket needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 40 minutes.

+

Taxi: taxi from the airport to the city center costs approximately 50 euros

+

Social Dinner

+

The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)

+

Address: Hofbräuhaus, Platzl 9, Munich

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/seventh-tuc-meeting/index.html b/event/seventh-tuc-meeting/index.html new file mode 100644 index 00000000..da21ef07 --- /dev/null +++ b/event/seventh-tuc-meeting/index.html @@ -0,0 +1,551 @@ + + + + + Seventh TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Seventh TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Ioan Toma / on 26 Dec 2015
+ + + +
+ Location: IBM's TJ Watson, US +
+
+ + Event dates: 09 Nov 2015 14:17 -- 10 Nov 2015 14:17 (local timezone) + +
+
+

The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.

+

This will be a two-day event at IBM’s TJ Watson facility on Monday and Tuesday November 9/10, 2015.

+

This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking experiences.
  • +
  • Presentation of the benchmarking results for the different benchmarks.
  • +
  • Interaction with the new LDBC Board of Directors and the LDBC organisation officials.
  • +
+

We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu; in order to notify IBM security in advance, registration requests need to be in by Nov 1.

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.

+

Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.

+

In this page, you’ll find information about the following items:

+ +

Agenda

+

Monday, 9th of November 2015

+

8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)

+

9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)

+

9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)

+

9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload

+

10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload

+

10:30-11:00 Coffee break

+

11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)

+

11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.

+

11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.

+

12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status

+

12:30 - 14:00 Lunch break

+

14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)

+

14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox

+

14:30 Peter Kogge (Notre Dame). BFS as in Graph500 on today’s architectures

+

15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G

+

15:30-16:00 Coffee break

+

16:00 - 17:00 Technologies (chair Irini Fundulaki)

+

16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store

+

16:30 David Ediger (GeorgiaTech). STINGER

+

17:00 Gary King (Franz Inc.). AllegroGraph’s SPARQL implementation with Social Network Analytics abilities using Magic Properties

+

17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics

+

18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase

+

19:00 Social dinner

+

Tuesday 10th November 2015

+

9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)

+

9:00 Philip Rathle (Neo). On openCypher

+

9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification

+

9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions

+

10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation

+

10:30 - 11:00 Coffee break

+

11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)

+

11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL

+

11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload

+

11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis

+

12:00 Josep Lluis Larriba-Pey. Conclusions for the TUC meeting and future perspectives

+

12:30 - 14:00 Lunch break

+

14:00 LDBC Board of Directors

+

Logistics

+
Date
+

9th and 10th November 2015

+
Venue
+

The TUC meeting will be held in the IBM Thomas J Watson Research Center.
+The address is:

+

IBM Thomas J Watson Research Center
+1101 Kitchawan Rd,
+Yorktown Heights, NY 10598, USA

+

If you are using a GPS system, please enter “200 Aqueduct Road, Ossining NY, 10562” for accurate directions to the lab entrance. You may also want to check the routing online.

+

The meeting will take place in the Auditorium on November 9th, and in Meeting Room 20-043 on November 10th.

+
Maps and situation
+

We strongly suggest renting a car for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walking distance of the IBM T.J. Watson Research Center. Feel free to arrange a carpool with other attendees. You may find car rental and hotels through www.orbitz.com, or www.expedia.com. Feel free to email yxia@us.ibm.com with any questions.

+

+
Getting there
+

Upper and Eastern New England

+

Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. IBM is on the left.

+

New Haven and Connecticut Shores

+

Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.

+

New Jersey

+

Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.

+

Upstate New York

+

Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.

+

New York City (Manhattan)

+

Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.

+

John F. Kennedy International Airport

+

North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.

+

LaGuardia Airport

+

East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. Continue with instructions from John F. Kennedy International Airport, above.

+

Newark International Airport

+

North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.

+

Stewart International Airport

+

Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.

+

Westchester County Airport

+

Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.

+

Public Transportation

+

Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/sixteenth-tuc-meeting/index.html b/event/sixteenth-tuc-meeting/index.html new file mode 100644 index 00000000..cb20f281 --- /dev/null +++ b/event/sixteenth-tuc-meeting/index.html @@ -0,0 +1,732 @@ + + + + + Sixteenth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Sixteenth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Oskar van Rest, Gábor Szárnyas / on 14 Feb 2023
+ + + +
+ Location: Seattle +
+
+ + Event dates: 23 Jun 2023 09:00 -- 24 Jun 2023 18:00 (local timezone) + +
+
+

Organizers: Oskar van Rest, Alastair Green, Gábor Szárnyas

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2023 on June 23-24 (Friday-Saturday).

+

The program consists of 10- and 15-minute talks followed by a Q&A session. The talks will be recorded and made available online. If you would like to participate please register using our form.

+

LDBC will host a social event on Friday at the Black Bottle gastrotavern in Belltown: 2600 1st Ave (on the corner of Vine), Seattle, WA 98121.

+

In addition, AWS will host a Happy Hour (rooftop grill with beverages) on Saturday on the Amazon Nitro South building’s 8th floor deck: 2205 8th Ave, Seattle, WA 98121.

+

Program

+

All times are in PDT.

+

Friday

+

Location: Hyatt Regency Bellevue on Seattle’s Eastside, room Grand K, co-located with SIGMOD (900 Bellevue Way NE, Bellevue, WA 98004-4272)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startfinishspeakertitle
08:3008:45Oskar van Rest (Oracle)LDBC – State of the union – slides, video
08:5009:05Keith Hare (JCC / WG3)An update on the GQL & SQL/PGQ standards efforts – slides, video
09:1009:25Stefan Plantikow (Neo4j / WG3)GQL - Introduction to a new query language standard – slides
09:3009:45Leonid Libkin (University of Edinburgh & RelationalAI)Formalizing GQL – slides, video
09:5010:05Semen Panenkov (JetBrains Research)Mechanizing the GQL semantics in Coq – slides, videos
10:1010:25Oskar van Rest (Oracle)SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – slides, video
10:3011:00coffee break
11:0011:15Alastair Green (JCC)LDBC’s organizational changes and fair use policies – slides
11:2011:35Ioana Manolescu (INRIA)Integrating Connection Search in Graph Queries – slides, video
11:4011:55Maciej Besta (ETH Zurich)Neural Graph Databases with Graph Neural Networks – video
12:0012:10Longbin Lai (Alibaba Damo Academy)To Revisit Benchmarking Graph Analytics – slides, video
12:1513:30lunch
13:3013:45Yuanyuan Tian (Gray Systems Lab, Microsoft)The World of Graph Databases from An Industry Perspective – slides, video
13:5014:05Alin Deutsch (UC San Diego & TigerGraph)TigerGraph’s Parallel Computation Model – slides, video
14:1014:25Chen Zhang (CreateLink)Applications of a Native Distributed Graph Database in the Financial Industry – video
14:3014:45Ricky Sun (Ultipa)Design of highly scalable graph database systems – slides, video
14:5015:30coffee break
15:3015:45Heng Lin (Ant Group)The LDBC SNB implementation in TuGraph – slides, video
15:5016:05Shipeng Qi (Ant Group)FinBench: The new LDBC benchmark targeting financial scenario – slides, video
16:1017:00host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)FinBench panel – slides
19:0022:00dinnerBlack Bottle gastrotavern in Belltown: 2600 1st Ave (on the corner of Vine), Seattle, WA 98121
+

Saturday

+

Location: Amazon Nitro South building, room 03.204 (2205 8th Ave, Seattle, WA 98121)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
startfinishspeakertitle
09:0009:45Brad Bebee (AWS)Customers don’t want a graph database, so why are we still here? – slides, video
10:0010:15Muhammad Attahir Jibril (TU Ilmenau)Fast and Efficient Update Handling for Graph H2TAP – slides, video
10:2011:00coffee break
11:0011:15Gabor Szarnyas (CWI)LDBC Social Network Benchmark and Graphalytics – slides
11:2011:30Atanas Kiryakov and Tomas Kovachev (Ontotext)GraphDB – Benchmarking against LDBC SNB & SPB – slides, video
11:3511:50Roi Lipman (Redis Labs)Delta sparse matrices within RedisGraph – slides, video
11:5512:05Rathijit Sen (Microsoft)Microarchitectural Analysis of Graph BI Queries on RDBMS – slides, video
12:1013:30lunchon your own
13:3013:45Alastair Green (JCC)LEX – LDBC Extended GQL Schema – slides, video
13:5014:05Ora Lassila (AWS)Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – slides, video
14:1014:25Jan Hidders (Birkbeck, University of London)PG-Schema: a proposal for a schema language for property graphs – slides, video
14:3014:45Max de Marzi (RageDB and RelationalAI)RageDB: Building a Graph Database in Anger – slides, video
14:5015:30coffee break
15:3015:45Umit Catalyurek (AWS)HPC Graph Analytics on the OneGraph Model – slides, video
15:5016:05David J. Haglin (Trovares)How LDBC impacts Trovares – slides, video
16:1016:25Wenyuan Yu (Alibaba Damo Academy)GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – slides, video
16:3016:40Scott McMillan (Carnegie Mellon University)Graph processing using GraphBLAS – slides, video
16:4516:55Tim Mattson (Intel)Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – slides
17:0020:00happy hour (rooftop grill with beverages)on the Nitro South building’s 8th floor deck
+

TUC event locations

+

A map of the LDBC TUC events we hosted so far.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/sixth-tuc-meeting/index.html b/event/sixth-tuc-meeting/index.html new file mode 100644 index 00000000..b8979cc1 --- /dev/null +++ b/event/sixth-tuc-meeting/index.html @@ -0,0 +1,521 @@ + + + + + Sixth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Sixth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by / on 23 Apr 2015
+ + + +
+ Location: Barcelona, March +
+
+ + Event dates: 19 Mar 2015 13:53 -- 20 Mar 2015 13:53 (local timezone) + +
+
+

The LDBC consortium is pleased to announce its Sixth Technical User Community (TUC) meeting.

+

This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on Thursday and Friday March 19/20, 2015.

+

The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:

+
    +
  • Two day event with one day devoted to User’s experiences and one day devoted to benchmarking experiences.
  • +
  • Presentation of the first benchmarking results for the different benchmarks.
  • +
  • Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.
  • +
  • Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.
  • +
+

We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact damaris@ac.upc.edu.

+

Agenda

+

Thursday 19th March

+

11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)

+

11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – slides

+

12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)

+

12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. – slides

+

12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain

+

12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive

+

13:10 Claudio Martella (VUA): Giraph and Lighthouse

+

13:30 - 14:30 Lunch break

+

14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)

+

14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production slides

+

14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services slides

+

15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph slides

+

15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs

+

18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.

+

20:00 Social dinner at Bastaix Restaurant.

+

Friday 20th March

+

9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)

+

9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics

+

9:50 Alexandru Iosup (TU Delft). Graphalytics: A big data benchmark for graph-processing platforms

+

10:10 John Snelson (MarkLogic): Introduction to MarkLogic

+

10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload

+

10:50 Moritz Kaufmann. The auditing experience

+

11:15 - 11:45 Coffee break

+

11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)

+

11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox

+

12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data

+

12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments

+

12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. slides

+

13:30 - 14:30 Lunch break

+

15:00 LDBC Board of Directors

+

Logistics

+
Date
+

19th and 20th March 2015

+
Venue
+

The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord UPC” in Barcelona. The address is:

+

Aula Master
+Edifici A3, Campus Nord UPC
+C. Jordi Girona, 1-3
+08034 Barcelona, Spain

+
Maps and situation
+

To reach the campus, there are several options, including Taxi, Metro and Bus.

+

+
Finding UPC
+

+
Finding the meeting room
+
Getting there
+

Flying: Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this map of the airport). It is possible to buy packs of 10 train tickets, which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.

+

Rail: The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to
+the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.

+

Bus: The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.

+

Taxi: From the airport, you can take one of Barcelona’s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.

+

Train and bus: Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: http://www.barcelona-airport.com/eng/transport_eng.htm

+

+
The locations of the airport and the city centre
+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/tenth-tuc-meeting/index.html b/event/tenth-tuc-meeting/index.html new file mode 100644 index 00000000..48f18cd2 --- /dev/null +++ b/event/tenth-tuc-meeting/index.html @@ -0,0 +1,508 @@ + + + + + Tenth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Tenth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Peter Boncz / on 05 Nov 2017
+ + + +
+ Location: +
+
+ + Event date: 01 Sep 2017 10:30 (local timezone) + +
+
+

This will be a one-day event at the VLDB 2017 conference in Munich, Germany on September 1, 2017.

+

Topics and activities of interest in these TUC meetings are:

+
    +
  • Presentation on graph data management usage scenarios.
  • +
  • Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.
  • +
  • Interaction with the new LDBC Board of Directors and the LDBC organisation officials.
  • +
+

We welcome all users of RDF and Graph technologies to attend. If you are interested in attending the event, please contact Adrian Diaz (UPC) at adiaz@ac.upc.edu to register; registration is free, but required.

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.

+

Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.

+

Agenda

+

In the TUC meeting there will be:

+
    +
  • updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.
  • +
  • talks by data management practitioners highlighting graph data management challenges
  • +
  • selected scientific talks on graph data management technology
  • +
+

The meeting will start on Friday morning, with a program from 10:30-17:00

+

10:30-12:00: TUC session (public)

+ +

12:00-13:30: lunch break

+

13:30-15:00: TUC session (public)

+ +

15:00-15:30: break

+

15:30-17:00: TUC session (public)

+ +

Speakers should aim for a 20-minute talk.

+

Further:

+
    +
  • on Friday evening (19:00-21:00) there will be a social dinner at Löwenbräukeller, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).
  • +
  • on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.
  • +
+

Venue

+

The Technical University of Munich (TUM) is hosting the VLDB conference that week; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.

+

The TUC meeting will be held in Room 2607 alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).

+

address: Technische Universität München (TUM), Arcisstraße 21, 80333 München

+

Google Maps

+


+

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/third-tuc-meeting/index.html b/event/third-tuc-meeting/index.html new file mode 100644 index 00000000..be1b3064 --- /dev/null +++ b/event/third-tuc-meeting/index.html @@ -0,0 +1,596 @@ + + + + + Third TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Third TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Peter Boncz / on 04 Apr 2021
+ + + +
+ Location: London, United Kingdom +
+
+ + Event date: 19 Nov 2013 08:00 (local timezone) + +
+
+

The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!

+

This will be a one day event in London on 19 November 2013 running in collaboration with the GraphConnect event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using the following coupon code: LDBCTUC.

+

The TUC event will include:

+
    +
  • Introduction to the objectives and progress of the LDBC project
  • +
  • Description of the progress of the benchmarks being evolved through Task Forces
  • +
  • Users explaining their use-cases and describing the limitations they have found in current technology
  • +
  • Industry discussions on the contents of the benchmarks
  • +
+

We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.

+

We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.

+

All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu

+ +

Agenda

+

November 19th - Public TUC Meeting

+

8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)

+

short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)

+

NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.

+

10:00 TUC Meeting Opening (Peter Boncz)

+

10:10 TUC Presentations (RDF Application Descriptions)

+ +

11:30 Semantic Publishing Benchmark (SPB)

+ +

12:00-13:00 Lunch at the Graph Connect venue

+

Talks During Lunch:

+ +

13:00 TUC Presentations (Graph Application Descriptions)

+ +

14:00 Social Network Benchmark (SNB)

+ +

14:30 Break

+

14:45 TUC Presentations (Graph Analytics)

+
    +
  • Keith Houck (IBM): Benchmarking experiences with [System G Native Store (tentative title)]
  • +
  • Abraham Bernstein (University of Zurich): Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store
  • +
  • Luis Ceze (University of Washington): Grappa and GraphBench Status Update
  • +
+

15:45 Break

+

16:00 TUC Presentations (Possible Future RDF Benchmarking Topics)

+ +

17:20 Meeting Conclusion (Josep Larriba Pey)

+

17:30 End of TUC meeting

+

19:00 Social dinner

+

November 20th - Internal LDBC Meeting

+

10:00 Start

+

12:30 End of meeting

+
    +
  • coffee and lunch provided
  • +
+

Logistics

+

Date

+

19th November 2013

+

Location

+

The TUC meeting will be held in The Tower hotel (Google Maps link) approximately 4 minutes walk from the GraphConnect conference in London.

+

Getting there

+
    +
  • From City Airport is the easiest: short ride on the DLR to Tower Gateway. Easy.
  • +
  • From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. See attached.
  • +
+

LDBC/TUC Background

+

Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:

+ +

A summary of these efforts can be read below or, for a more detailed account, please refer to: The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.

+

Social Network Benchmark

+

The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. The social network scenario was chosen with the following goals in mind:

+
    +
  • it should be understandable, and the relevance of managing such data should be understandable
  • +
  • it should cover the complete range of interesting challenges, according to the benchmark scope
  • +
  • the queries should be realistic, i.e., similar data and workloads are encountered in practice
  • +
+

SNB includes a data generator for creation of synthetic social network data with the following characteristics:

+
    +
  • data schema is representative of real social networks
  • +
  • data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions
  • +
  • the software generator is easy-to-use, configurable and scalable
  • +
+

SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:

+
    +
  • Interactive +
      +
    • Tests system throughput with relatively simple queries and concurrent updates; it is designed to test ACID features and scalability in an online operational setting.
    • +
    • The targeted systems are expected to be those that offer transactional functionality.
    • +
    +
  • +
  • Business Intelligence +
      +
    • Consists of complex structured queries for analyzing online behavior of users for marketing purposes; it is designed to stress query execution and optimization.
    • +
    • The targeted systems are expected to be those that offer an abstract query language.
    • +
    +
  • +
  • Graph Analytics +
      +
    • Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.
    • +
    • Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need isolation.
    • +
    • The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.
    • +
    +
  • +
+

Semantic Publishing Benchmark

+

The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.

+

The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works – input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.

+

The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.

+

Two separate workloads are modeled in SPB:

+
    +
  • Editorial: Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.
  • +
  • Aggregation: Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as “dynamic”, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.
  • +
+

Status of the Semantic Publishing Benchmark

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/thirteenth-tuc-meeting/index.html b/event/thirteenth-tuc-meeting/index.html new file mode 100644 index 00000000..934ebdcc --- /dev/null +++ b/event/thirteenth-tuc-meeting/index.html @@ -0,0 +1,478 @@ + + + + + Thirteenth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Thirteenth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by Peter Boncz / on 26 Jun 2020
+ + + +
+ Location: Online (Zoom) +
+
+ + Event dates: 30 Jun 2020 14:00 -- 01 Jul 2020 17:00 (local timezone) + +
+
+

LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.

+

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.

+

This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. If you are interested in attending the event, please contact Gabor Szarnyas (BME) to register.

+

SNB Task Force

+
    +
  • Progress report +
      +
    • ACID compliance test suite
    • +
    • Integrating deletions to Datagen
    • +
    • Migrating Datagen to Spark
    • +
    • Redesign of BI read queries
    • +
    • Extensions to the driver
    • +
    +
  • +
  • Ongoing work +
      +
    • Datagen: tuning the distribution of deletes
    • +
    • Interactive 2.0 workload
    • +
    • BI 1.0 workload
    • +
    +
  • +
+

Zoom links will be sent through email.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/event/twelfth-tuc-meeting/index.html b/event/twelfth-tuc-meeting/index.html new file mode 100644 index 00000000..3076aac3 --- /dev/null +++ b/event/twelfth-tuc-meeting/index.html @@ -0,0 +1,529 @@ + + + + + Twelfth TUC Meeting + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Twelfth TUC Meeting

+ + + + +
+
+
+
+ + + + +
+
+
+
+ + +
by / on 11 Jul 2019
+ + + +
+ Location: Amsterdam, the Netherlands +
+
+ + Event date: 05 Jul 2019 08:30 (local timezone) + +
+
+

LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.

+

LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.

+

This TUC meeting will be a one-day event on the last Friday of SIGMOD/PODS 2019 in Amsterdam, The Netherlands, in the conference venue of Beurs van Berlage. The room is the Mendes da Silva kamer. Please check its tips for accommodation in Amsterdam.

+

Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called GRADES-NDA 2019, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).

+

We welcome all users of RDF and Graph technologies to attend. If you are interested in attending the event, please contact Damaris Coll (UPC) at damaris@ac.upc.edu to register.

+

=> registration is free, but required <=

+

You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.

+

In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.

+

Talk proposals can be sent to Peter Boncz, who is also the local organizer. Please also send your slides to this email for archiving on this site.

+

Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.

+

Agenda

+

In the TUC meeting, there will be:

+
    +
  • updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.
  • +
  • talks by data management practitioners highlighting graph data management challenges and products
  • +
+

The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).

+

The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (room: Mendes da Silva kamer):

+

08:30-10:30 LDBC Board Meeting (non-public)

+

10:30-11:00 Coffee

+

11:00-12:45 Session 1: Graph Benchmarks

+ +

12:45-14:00 Lunch

+

14:00-16:05 Session 2: Graph Query Languages

+ +

16:05-16:30 Coffee

+

16:30-17:50 Session 3: Graph System Performance

+ +

If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.

+ +
+ +
+
Tags:
+ +
+ + +
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/events/index.html b/events/index.html new file mode 100644 index 00000000..7803d855 --- /dev/null +++ b/events/index.html @@ -0,0 +1,10 @@ + + + + /event + + + + + + diff --git a/gql-community/elwg/index.html b/gql-community/elwg/index.html new file mode 100644 index 00000000..d3e5a2b6 --- /dev/null +++ b/gql-community/elwg/index.html @@ -0,0 +1,371 @@ + + + + + Existing Languages Working Group (ELWG) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Existing Languages Working Group (ELWG)

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+
    +
  • Group leader: Petra Selmer (Neo4j)
  • +
  • Focus: Surveying existing graph query languages
  • +
+

Group members

+
    +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Frank Celler (ArangoDB)
  • +
  • Victor Lee (TigerGraph)
  • +
  • Harsh Thakkar (Consultant OSTHUS GmBH)
  • +
  • Jeffrey Lovitz (RedisGraph)
  • +
  • Renzo Angles (Universidad de Talca)
  • +
+ +
+
+ +
+
+
+
+
+
+
+
+

Latest Working Group Updates

+
+
+

+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/fswg/index.html b/gql-community/fswg/index.html new file mode 100644 index 00000000..45f2cc4e --- /dev/null +++ b/gql-community/fswg/index.html @@ -0,0 +1,362 @@ + + + + + Formal Semantics Working Group (FSWG) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Formal Semantics Working Group (FSWG)

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+
    +
  • Group leaders: Leonid Libkin (ENS Paris, University of Edinburgh), Paolo Guagliardo (University of Edinburgh)
  • +
  • Focus: Establishing formal semantics for the upcoming GQL language
  • +
+ +
+
+ +
+
+
+
+
+
+
+
+

Latest Working Group Updates

+
+
+

+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/index.html b/gql-community/index.html new file mode 100644 index 00000000..927e29b0 --- /dev/null +++ b/gql-community/index.html @@ -0,0 +1,765 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+ +
+ +

We are delighted to announce the official release of the initial version (v0.1.0) of Financial Benchmark (FinBench).

+

The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the LDBC FinBench Task Force. The benchmark has one workload currently, Transaction Workload, capturing OLTP scenario with complex read queries that access the …

+ +
+
+ +
+ + +
+
+
+ +

Posts

+
Tags:
+ +
+
+ + + +
+
+ +
+ + +
+
+
+ +

LDBC SNB – Early 2023 updates

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

2023 has been an eventful year for us so far. Here is a summary of our recent activities.

+
    +
  1. +

    Our paper The LDBC Social Network Benchmark: Business Intelligence Workload was published in PVLDB.

    +
  2. +
  3. +

    David Püroja just completed his MSc thesis on creating a design towards SNB Interactive v2 at CWI’s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference’s graph developer room titled The LDBC Social Network …

+ +
+
+ +
+ + +
+
+
+ +

LDBC SNB Datagen – The winding path to SF100K

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my last technical update on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the …

+ +
+
+ +
+ + +
+
+ +
+ +

We are delighted to announce the set up of the Financial Benchmark (FinBench) task force.

+

The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. The FinBench is scheduled to be released in the …

+ +
+
+ +
+ + +
+
+
+ +

Speeding Up LDBC SNB Datagen

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC’s Social Network Benchmark [4] (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems’ bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/index.xml b/gql-community/index.xml new file mode 100644 index 00000000..ab8631b8 --- /dev/null +++ b/gql-community/index.xml @@ -0,0 +1,5297 @@ + + + + Gql-communities on Linked Data Benchmark Council + https://ldbcouncil.org/gql-community/ + Recent content in Gql-communities on Linked Data Benchmark Council + Hugo -- gohugo.io + en-us + &copy; Copyright LDBC 2024 + + Announcing the Official Release of LDBC Financial Benchmark v0.1.0 + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + Tue, 27 Jun 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + <p>We are delighted to announce the official release of the initial version (v0.1.0) of <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench)</a>.</p> +<p>The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">LDBC FinBench Task Force</a>. The benchmark has one workload currently, <strong>Transaction Workload</strong>, capturing OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.</p> +<p>Compared to LDBC SNB, the FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the <a href="https://ldbcouncil.org/benchmarks/finbench/finbench-talk-16th-tuc.pdf">slides</a> in the 16th TUC. 
The <a href="https://arxiv.org/pdf/2306.15975.pdf">Financial Benchmark&rsquo;s specification</a> can be found on arXiv.</p> +<p>The release of FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2022. It is the good beginning of FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.</p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.</p> + + + + + Sixteenth TUC Meeting + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + Fri, 23 Jun 2023 09:00:00 -0800 + + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Oskar van Rest, Alastair Green, Gábor Szárnyas</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2023.sigmod.org/venue.shtml">SIGMOD 2023</a> on <strong>June 23-24 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10- and 15-minute talks followed by a Q&amp;A session. The talks will be recorded and made available online. 
<strong>If you would like to participate please register using <a href="https://forms.gle/T6bwVHzK9V5FaKyR9">our form</a>.</strong></p> +<p>LDBC will host a <strong>social event</strong> on Friday at the <a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a>.</p> +<p>In addition, AWS will host a <strong>Happy Hour</strong> (rooftop grill with beverages) on Saturday on the Amazon Nitro South building&rsquo;s 8th floor deck: <a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>.</p> +<h3 id="program">Program</h3> +<p><strong>All times are in PDT.</strong></p> +<h4 id="friday">Friday</h4> +<p><strong>Location:</strong> Hyatt Regency Bellevue on Seattle&rsquo;s Eastside, <strong>room Grand K</strong>, co-located with SIGMOD (<a href="https://www.hyatt.com/en-US/hotel/washington/hyatt-regency-bellevue-on-seattles-eastside/belle">900 Bellevue Way NE, Bellevue, WA 98004-4272</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>08:30</td> +<td>08:45</td> +<td>Oskar van Rest (Oracle)</td> +<td>LDBC – State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-ldbc-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/Frk7ITssaSY">video</a></td> +</tr> +<tr> +<td>08:50</td> +<td>09:05</td> +<td>Keith Hare (JCC / WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/keith-hare-an-update-on-the-gql-and-sql-pgq-standards-efforts.pdf">slides</a>, <a href="https://youtu.be/LQYkal_0j6E">video</a></td> +</tr> +<tr> +<td>09:10</td> +<td>09:25</td> +<td>Stefan Plantikow (Neo4j / WG3)</td> +<td>GQL - Introduction to a new query 
language standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/stefan-plantikow-gql-v1.pdf">slides</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Leonid Libkin (University of Edinburgh &amp; RelationalAI)</td> +<td>Formalizing GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/leonid-libkin-formalizing-gql.pdf">slides</a>, <a href="https://youtu.be/YZE1a00h1I4">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Semen Panenkov (JetBrains Research)</td> +<td>Mechanizing the GQL semantics in Coq – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/semyon-panenkov-gql-in-coq.pdf">slides</a>, <a href="https://youtu.be/5xBGohqWCzo">videos</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Oskar van Rest (Oracle)</td> +<td>SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-sql-property-graphs-in-oracle-database-and-oracle-graph-server-pgx.pdf">slides</a>, <a href="https://youtu.be/owM9WiQubpg">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (JCC)</td> +<td>LDBC&rsquo;s organizational changes and fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-ldbc-corporate-restructuring-and-fair-use-policies.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>Ioana Manolescu (INRIA)</td> +<td>Integrating Connection Search in Graph Queries – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ioana-manolescu-integrating-connection-search-in-graph-queries.pdf">slides</a>, <a 
href="https://youtu.be/LQPnmcrkUpY">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Maciej Besta (ETH Zurich)</td> +<td>Neural Graph Databases with Graph Neural Networks – <a href="https://youtu.be/ce5qNievRNs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:10</td> +<td>Longbin Lai (Alibaba Damo Academy)</td> +<td>To Revisit Benchmarking Graph Analytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/longbin-lai-benchmark-ldbc.pdf">slides</a>, <a href="https://youtu.be/s9Vtt-6t_FI">video</a></td> +</tr> +<tr> +<td>12:15</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>The World of Graph Databases from An Industry Perspective – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/yuanyuan-tian-world-of-graph-databases.pdf">slides</a>, <a href="https://youtu.be/AZuP_b95GPM">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Alin Deutsch (UC San Diego &amp; TigerGraph)</td> +<td>TigerGraph&rsquo;s Parallel Computation Model – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alin-deutsch-tigergraphs-computation-model.pdf">slides</a>, <a href="https://youtu.be/vcxdieJB80Y">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Chen Zhang (CreateLink)</td> +<td>Applications of a Native Distributed Graph Database in the Financial Industry – <a href="https://youtu.be/GCCT79Sps9I">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Ricky Sun (Ultipa)</td> +<td>Design of highly scalable graph database systems – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ricky-sun-ultipa.pdf">slides</a>, <a href="https://youtu.be/Sg1F64O4vGM">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> 
+<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Heng Lin (Ant Group)</td> +<td>The LDBC SNB implementation in TuGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-the-ldbc-snb-implementation-in-tugraph.pdf">slides</a>, <a href="https://youtu.be/fy8AuVerwnY">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>Shipeng Qi (Ant Group)</td> +<td>FinBench: The new LDBC benchmark targeting financial scenario – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/shipeng-qi-finbench.pdf">slides</a>, <a href="https://youtu.be/0xLZadDOfZk">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>17:00</td> +<td>host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>FinBench panel – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-finbench-panel.pdf">slides</a></td> +</tr> +<tr> +<td>19:00</td> +<td>22:00</td> +<td><em>dinner</em></td> +<td><em><a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a></em></td> +</tr> +</tbody> +</table> +<h4 id="saturday">Saturday</h4> +<p><strong>Location:</strong> Amazon Nitro South building, <strong>room 03.204</strong> (<a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:00</td> +<td>09:45</td> +<td>Brad Bebee (AWS)</td> +<td>Customers don&rsquo;t want a graph database, so why are we still here? 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/brad-bebee-tuc-keynote.pdf">slides</a>, <a href="https://youtu.be/bJlkpDC--fM">video</a></td> +</tr> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Fast and Efficient Update Handling for Graph H2TAP – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/muhammad-attahir-jibril-fast-and-efficient-update-handling-for-graph-h2tap.pdf">slides</a>, <a href="https://youtu.be/e8ZAszBsXV0">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Gabor Szarnyas (CWI)</td> +<td>LDBC Social Network Benchmark and Graphalytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-social-network-benchmark-and-graphalytics.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:30</td> +<td>Atanas Kiryakov and Tomas Kovachev (Ontotext)</td> +<td>GraphDB – Benchmarking against LDBC SNB &amp; SPB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tomas-kovatchev-atanas-kiryakov-benchmarking-graphdb-with-snb-and-spb.pdf">slides</a>, <a href="https://youtu.be/U6OPpNFOWqg">video</a></td> +</tr> +<tr> +<td>11:35</td> +<td>11:50</td> +<td>Roi Lipman (Redis Labs)</td> +<td>Delta sparse matrices within RedisGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/roi-lipman-delta-matrix.pdf">slides</a>, <a href="https://youtu.be/qfKsplV4Ihk">video</a></td> +</tr> +<tr> +<td>11:55</td> +<td>12:05</td> +<td>Rathijit Sen (Microsoft)</td> +<td>Microarchitectural Analysis of Graph BI Queries on RDBMS – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/rathijit-sen-microarchitectural-analysis.pdf">slides</a>, <a href="https://youtu.be/55B8CkH09js">video</a></td> +</tr> +<tr> +<td>12:10</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td><em>on your own</em></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Alastair Green (JCC)</td> +<td>LEX &ndash; LDBC Extended GQL Schema – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-lex.pdf">slides</a>, <a href="https://youtu.be/DVpeb4Ce9Uw">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Ora Lassila (AWS)</td> +<td>Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ora-lassila-why-limit-yourself-to-lpg-when-you-can-do-rdf-too.pdf">slides</a>, <a href="https://youtu.be/7uAInoUwdds">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Jan Hidders (Birkbeck, University of London)</td> +<td>PG-Schema: a proposal for a schema language for property graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/jan-hidders-pg-schema.pdf">slides</a>, <a href="https://youtu.be/yQNL8hBTE4M">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Max de Marzi (RageDB and RelationalAI)</td> +<td>RageDB: Building a Graph Database in Anger – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/max-de-marzi-ragedb-building-a-graph-database-in-anger.pdf">slides</a>, <a href="https://youtu.be/LBbF8aslYFE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Umit Catalyurek (AWS)</td> +<td>HPC Graph Analytics on the OneGraph Model – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/umit-catalyurek-onegraph-hpc.pdf">slides</a>, <a href="https://youtu.be/64tv5LA6Wr8">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>David J. Haglin (Trovares)</td> +<td>How LDBC impacts Trovares – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/david-haglin-trovares.pdf">slides</a></td> +</tr> +<tr> +<td>16:10</td> +<td>16:25</td> +<td>Wenyuan Yu (Alibaba Damo Academy)</td> +<td>GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/wenyuan-yu-graphscope-flex.pdf">slides</a>, <a href="https://youtu.be/cRikoyDmMks">video</a></td> +</tr> +<tr> +<td>16:30</td> +<td>16:40</td> +<td>Scott McMillan (Carnegie Mellon University)</td> +<td>Graph processing using GraphBLAS – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/scott-mcmillan-graph-processing-using-graphblas.pdf">slides</a>, <a href="https://youtu.be/yb4hGBhUzQQ">video</a></td> +</tr> +<tr> +<td>16:45</td> +<td>16:55</td> +<td>Tim Mattson (Intel)</td> +<td>Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tim-mattson-graphblas-and-tiledb.pdf">slides</a></td> +</tr> +<tr> +<td>17:00</td> +<td>20:00</td> +<td><em>happy hour (rooftop grill with beverages)</em></td> +<td><em>on the Nitro South building&rsquo;s 8th floor deck</em></td> +</tr> +</tbody> +</table> +<h4 id="tuc-event-locations">TUC event locations</h4> +<p>A <a href="https://www.google.com/maps/d/u/0/edit?mid=19_fi4fV-3-PZkNWCCcmhU86ct2EZXbgo">map of the LDBC TUC events</a> we hosted so far.</p> + + + + + LDBC SNB – Early 2023 updates + 
https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + Wed, 15 Feb 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + <p>2023 has been an eventful year for us so far. Here is a summary of our recent activities.</p> +<ol> +<li> +<p>Our paper <a href="https://ldbcouncil.org/docs/papers/ldbc-snb-bi-vldb-2022.pdf">The LDBC Social Network Benchmark: Business Intelligence Workload</a> was published in PVLDB.</p> +</li> +<li> +<p>David Püroja just completed his MSc thesis on creating a design towards <a href="https://ldbcouncil.org/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf">SNB Interactive v2</a> at CWI&rsquo;s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference&rsquo;s graph developer room titled <a href="https://fosdem.org/2023/schedule/event/graph_ldbc/">The LDBC Social Network Benchmark</a> (<a href="https://www.youtube.com/watch?v=YNF6z6gtXY4">YouTube mirror</a>).</p> +</li> +<li> +<p>I gave a lightning talk at FOSDEM&rsquo;s HPC developer room titled <a href="https://www.youtube.com/watch?v=q26DHnQFw54">The LDBC Benchmark Suite</a> (<a href="https://www.youtube.com/watch?v=q26DHnQFw54">YouTube mirror</a>).</p> +</li> +<li> +<p>Our auditors have successfully benchmarked a number of systems:</p> +<ul> +<li>SPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze)</li> +<li>SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja)</li> +<li>SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr)</li> +</ul> +</li> +</ol> +<p>The results and the full disclosure reports are available under the <a href="https://ldbcouncil.org/benchmarks/spb/">SPB</a> and <a href="https://ldbcouncil.org/benchmarks/snb/">SNB benchmark pages</a>.</p> + + + + + LDBC SNB Datagen – The winding path to SF100K + 
https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + Tue, 13 Sep 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + <p>LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">last technical update</a> on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we reached several goals: we refactored the serializers to use Spark&rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.</p> +<h1 id="moving-to-sparksql">Moving to SparkSQL</h1> +<p>We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL&rsquo;s capabilities.</p> +<blockquote> +<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. 
Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.</p> +</blockquote> +<p>Dealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL is unsuitable. We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup>, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators&rsquo; state between the UDFs to ensure deterministic execution. Weighing these and that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:</p> +<ol> +<li>table manipulations related to shaping the output into the supported layouts and data types as set forth in the specification;</li> +<li>deriving the Interactive and BI datasets;</li> +<li>and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes.</li> +</ol> +<p>We refer to points (1.) and (2.) collectively as dataset transformation, while (3.) as factor generation. Initially, these had been part of the generator and were extracted as part of this refactor, which resulted in a cleaner, more maintainable design.</p> +<p><img src="datagen_df_0.png" alt="Datagen stages"></p> +<p>The diagram above shows the components on a high level. 
The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there&rsquo;s no simple way to avoid<br> +it, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.</p> +<p>I&rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">previous blogpost in the series</a> or the <a href="https://arxiv.org/abs/2001.02299">Interactive benchmark specification</a>.</p> +<h1 id="transformation-pipeline">Transformation pipeline</h1> +<p>The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:</p> +<ul> +<li>explodes edges and / or attributes into separate tables,</li> +<li>subsets the snapshot part and creates insert / delete batches for the BI workload,</li> +<li>subsets the snapshot part for the Interactive workload,</li> +<li>applies formatting related options such as date time representation,</li> +<li>serializes the data to a Spark supported format (CSV, Parquet),</li> +</ul> +<p>We utilize a flexible data pipeline that operates on the graph.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code 
class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span>, <span style="color:#66d9ef">M2</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">]</span> <span style="color:#a6e22e">extends</span> <span style="color:#f92672">(</span><span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">])</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">In</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span 
style="color:#66d9ef">def</span> apply<span style="color:#f92672">(</span>v<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">])</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>v<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>The <code>Transform</code> trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let&rsquo;s see some of the transformations we have.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToBiTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">BI</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> keepImplicitDeletes<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span 
style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.BI</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span 
style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeEdges</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> 
+</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Therefore, a transformation pipeline may look like this:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>params<span style="color:#f92672">,</span> start<span style="color:#f92672">,</span> end<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>inputGraph<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>The <code>Graph</code> record has a <code>definition</code> field containing graph-global metadata, whereas <code>entities</code> holds the 
datasets keyed by their entity type. There are 3 graph <em>modes</em> currently: <code>Raw</code>, <code>Interactive</code> and <code>BI</code>. The BI dataset has a different layout than the rest, as it contains incremental inserts and deletes for the entities in addition to the bulk snapshot. This is captured in the <code>Layout</code> dependent type, over which the entities are polymorphic.</p> +<p>It&rsquo;s important to understand that <code>Graph</code> holds <code>DataFrame</code>s, and these are lazily computed by Spark. So, <code>Graph</code> is merely a description of transformations used to derive the comprising datasets, which makes them subject to all the SparkSQL fanciness such as query optimization, whole stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> isAttrExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> isEdgesExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> useTimestamp<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> mode<span style="color:#66d9ef">:</span> 
<span style="color:#66d9ef">M</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">Option</span><span style="color:#f92672">[</span><span style="color:#66d9ef">String</span><span style="color:#f92672">]]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> definition<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M</span><span style="color:#f92672">],</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">M</span><span style="color:#66d9ef">#</span><span style="color:#66d9ef">Layout</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">sealed</span> <span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Layout</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Raw</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Interactive</span><span style="color:#f92672">(</span>bulkLoadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... 
*/</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">BI</span><span style="color:#f92672">(</span>bulkloadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">,</span> batchPeriod<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">String</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">BatchedEntity</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>You may notice that <code>Transform</code> is statically typed w.r.t. <code>Mode</code>, however other properties, like <code>isAttrExploded</code>, or <code>isEdgesExploded</code> are not captured in the type, and remain merely dynamic. This makes some nonsensical transformation pipelines (i.e. that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.</p> +<p>As we already mentioned, <code>Graph</code> is essentially a persistent container of <code>EntityType -&gt; DataFrame</code> mappings. 
<code>EntityType</code> can be <code>Node</code>, <code>Edge</code> and <code>Attr</code>, and is used to identify the entity and embellish it with static metadata, such as a descriptive name and primary key, whether it is static or dynamic (as per the specification), and in case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.</p> +<p>Usually, a graph transformation involves matching entities based on their <code>EntityType</code>, and modifying the mapping (and if required, other metadata). Take, for example, the <code>ExplodeAttrs</code> transformation, which explodes into separate tables the values of two columns of <code>Person</code> stored as arrays:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">if</span> <span 
style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#75715e">// assert at runtime that the transformation hasn&#39;t been applied yet +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#66d9ef">throw</span> <span style="color:#66d9ef">new</span> <span style="color:#a6e22e">AssertionError</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Attributes already exploded in the input graph&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> explodedAttr<span style="color:#f92672">(</span>attr<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Attr</span><span style="color:#f92672">,</span> node<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">DataFrame</span><span style="color:#f92672">,</span> column<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Column</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">=</span> +</span></span><span style="display:flex;"><span> attr <span style="color:#f92672">-&gt;</span> node<span style="color:#f92672">.</span>select<span style="color:#f92672">(</span>withRawColumns<span style="color:#f92672">(</span>attr<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>parent<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">),</span> 
explode<span style="color:#f92672">(</span>split<span style="color:#f92672">(</span>column<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;;&#34;</span><span style="color:#f92672">)).</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>attribute<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">)))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> modifiedEntities <span style="color:#66d9ef">=</span> input<span style="color:#f92672">.</span>entities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>collect <span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k <span style="color:#66d9ef">@</span> <span style="color:#a6e22e">Node</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person&#34;</span><span style="color:#f92672">,</span> <span style="color:#66d9ef">false</span><span style="color:#f92672">),</span> df<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#75715e">// match the Person node. 
This is the only one ExplodeAttrs should modify +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#a6e22e">Map</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Email&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;EmailAddress&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonEmailEmailAddress&#34; entity derived by exploding the email column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Speaks&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;Language&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonSpeaksLanguage&#34; entity derived by exploding the language column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> k <span style="color:#f92672">-&gt;</span> df<span style="color:#f92672">.</span>drop<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">)</span> <span style="color:#75715e">// drop the exploded 
columns from person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntities <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>entities<span style="color:#f92672">)(</span><span style="color:#66d9ef">_</span> <span style="color:#f92672">++</span> <span style="color:#66d9ef">_</span><span style="color:#f92672">)</span> <span style="color:#75715e">// merge-replace the modified entities in the graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntityDefinitions <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#f92672">(</span>e<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> e <span style="color:#f92672">++</span> v<span style="color:#f92672">.</span>map<span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> k <span 
style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Some</span><span style="color:#f92672">(</span>v<span style="color:#f92672">.</span>schema<span style="color:#f92672">.</span>toDDL<span style="color:#f92672">)</span> <span style="color:#f92672">}</span> <span style="color:#75715e">// update the entity definition schema to reflect the modifications +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> l <span style="color:#66d9ef">=</span> lens<span style="color:#f92672">[</span><span style="color:#66d9ef">In</span><span style="color:#f92672">]</span> <span style="color:#75715e">// lenses provide a terse syntax for modifying nested fields +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">(</span>l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>entities<span style="color:#f92672">).</span>set<span style="color:#f92672">(</span>input<span style="color:#f92672">)((</span><span style="color:#66d9ef">true</span><span style="color:#f92672">,</span> updatedEntityDefinitions<span style="color:#f92672">,</span> updatedEntities<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Note that <code>EntityType</code> does not hold the dataset&rsquo;s full SQL schema currently, as it&rsquo;s not useful for pattern matching, but can be accessed directly from <code>DataFrame</code> if 
needed.</p> +<h1 id="inputoutput">Input/output</h1> +<p>The <code>Reader</code> and <code>Writer</code> typeclasses are used to read from a <code>Source</code> and write to a <code>Sink</code> respectively, terminating a graph transformation pipeline<br> +on both ends.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">T</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> read<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> exists<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">S</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Data</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> write<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Data</span><span style="color:#f92672">,</span> sink<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">S</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Unit</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>There are implementations under <code>ldbc.datagen.io.instances</code> that read a graph from a <code>GraphSource</code> and write to a <code>GraphSink</code>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span 
style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> source <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSource</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>read<span style="color:#f92672">(</span>source<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span><span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transformedGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>graph<span style="color:#f92672">)</span> +</span></span><span 
style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> sink <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSink</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>write<span style="color:#f92672">(</span>transformedGraph<span style="color:#f92672">,</span> sink<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>We provide <a href="https://github.com/typelevel/simulacrum">Ops syntax</a> to make it shorter:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> 
ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Reader.ops._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Writer.ops._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">).</span>read +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transformedGraph <span 
style="color:#66d9ef">=</span> <span style="color:#f92672">???</span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span>transformedGraph<span style="color:#f92672">.</span>write<span style="color:#f92672">(</span><span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">))</span> +</span></span></code></pre></div><p>The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.</p> +<p>Spark has a facility to derive SparkSQL schema from case classes automatically<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup>. We created case classes for each entity in the <code>Raw</code> dataset. 
We also created a typeclass <code>EntityTraits</code> associating these classes with their <code>EntityType</code>, so we can summon them (and consequently their SparkSQL schema) in the reader.</p> +<p>The case classes are used during the serialization of the generated dataset too, but more about that later.</p> +<h1 id="factor-generation">Factor generation</h1> +<p>As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity-local aggregates was impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a manageable scope, the original factor generation code had been removed.</p> +<p>We decided it&rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator&rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number of factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing certain factor tables even use <a href="https://spark.apache.org/graphx/">GraphX</a>, which was unimaginable with the previous design.</p> +<p>Factor tables are added by extending a map with a <code>name -&gt; Factor</code> pair. 
<code>Factor</code> declares its input entities, and accepts a function that receives input <code>DataFrames</code>, and returns a single <code>DataFrame</code> as output.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> factors <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Map</span> <span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;personDisjointEmployerPairs&#34;</span> <span style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Factor</span><span style="color:#f92672">(</span><span style="color:#a6e22e">PersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonKnowsPersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">OrganisationType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonWorkAtCompanyType</span><span style="color:#f92672">)</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">case</span> <span style="color:#a6e22e">Seq</span><span style="color:#f92672">(</span>person<span style="color:#f92672">,</span> personKnowsPerson<span style="color:#f92672">,</span> organisation<span style="color:#f92672">,</span> workAt<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> knows <span style="color:#66d9ef">=</span> undirectedKnows<span style="color:#f92672">(</span>personKnowsPerson<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> company <span style="color:#66d9ef">=</span> 
organisation<span style="color:#f92672">.</span>where<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;Type&#34;</span> <span style="color:#f92672">===</span> <span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">).</span>cache<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> personSample <span style="color:#66d9ef">=</span> person +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>orderBy<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>limit<span style="color:#f92672">(</span><span style="color:#ae81ff">20</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> personSample +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person2&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>knows<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;knows&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;knows.person2Id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>workAt<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;workAt&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;workAt.PersonId&#34;</span> <span 
style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;knows.Person1id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>company<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;Company.id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;workAt.CompanyId&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>select<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2id&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.name&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyName&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyId&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.creationDate&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2creationDate&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.deletionDate&#34;</span><span 
style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2deletionDate&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>distinct<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">},</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* more factors */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span></code></pre></div><p>As you can see, it&rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. <code>undirectedKnows</code>). Currently, there&rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. it uses the dataframe writer under the hood.</p> +<h1 id="revamping-the-data-generators-serializer">Revamping the data generator&rsquo;s serializer</h1> +<p>At this point, both the transformation pipeline and factor generator were ready; however, the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator&rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we&rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.</p> +<blockquote> +<p><a href="https://parquet.apache.org/">Parquet</a> is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. 
It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.</p> +</blockquote> +<p>The new serialization framework is heavily influenced by the design of Java <code>OutputStreams</code>, in the sense that stateful objects are composed to form a pipeline. For example, in case of <em>activities</em>, the input is an activity tree, and the output is a set of rows in multiple files (e.g. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown in the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).</p> +<p><img src="activity.png" alt="Activity serialization pipeline"></p> +<p>The benefit of this architecture is that only the last component needs to change when we add support for a new output format.</p> +<p>To support Parquet, we made use of row-level serializers available in Hadoop&rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. Remember how we used case classes for the <code>Raw</code> entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. 
<code>Forum</code>) and Spark&rsquo;s <code>Encoder</code> framework to encode the entities in Parquet, which means that the generated output remains consistent with the <code>DataFrame</code>-based reader, and we spare a lot of code duplication.</p> +<h1 id="optimizations">Optimizations</h1> +<p>After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out of memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (consequently, a larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation<sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup>. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still a more than 6x reduction in cost). We weren&rsquo;t able to run SF30K on 10 machines (1 machine / SF3K); even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf3k_bi <span style="color:#ae81ff">3000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">330</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf10k_bi <span style="color:#ae81ff">10000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span 
style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">1000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><p>The above examples are working configurations for generating the 3K and 10K BI datasets. The <code>--sf-per-executor</code> option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes respectively. The <code>--partitions</code> option controls the total number of partitions, and was calculated based on the number of persons using the formula <code>partitions = ceil(number_of_persons / block_size / 3)</code> to get a maximum of 3 blocks per partition.</p> +<h1 id="conclusion">Conclusion</h1> +<p>These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.</p> +<h1 id="footnotes">Footnotes</h1> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>The generator produces hierarchies, such as a forum wall with a random number of posts, that have comments, etc. 
This tree is iterated, and different entities are written to separate files.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>Shameless plug: You can learn more on this from <a href="https://www.dataversity.net/case-study-deriving-spark-encoders-and-schemas-using-implicits/">another blogpost of mine</a>.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>The data generator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>The maximum row count per file is currently 10M, however, this can be modified with a command line option. We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Fifteenth TUC Meeting + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + Fri, 17 Jun 2022 09:20:00 -0500 + + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2022.sigmod.org/venue.shtml">SIGMOD 2022</a> on <strong>June 17-18 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10-15 minute talks followed by a Q&amp;A session. The talks will be recorded and made available online.<br> +The tentative program is the following. 
<strong>All times are in EDT.</strong></p> +<p>We will have a social event on Friday at 17:30 at <a href="https://elvezrestaurant.com/">El Vez</a> (<a href="https://g.page/ElVezPhilly">Google Maps</a>).</p> +<h4 id="friday-pennsylvania-convention-centerhttpswwwpaconventioncom-room-204bhttps2022sigmodorgprogramshtml">Friday (<a href="https://www.paconvention.com/">Pennsylvania Convention Center</a>, <a href="https://2022.sigmod.org/program.shtml">room 204B</a>)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:20</td> +<td>09:30</td> +<td>Peter Boncz (LDBC/CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/39BoOIGk9Is">video</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Alastair Green (LDBC/Birkbeck)</td> +<td>LDBC&rsquo;s fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-fair-use-of-the-ldbc-trademark.pdf">slides</a>, <a href="https://youtu.be/7zmCysN4Rpg">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)</td> +<td>LDBC Social Network Benchmark: Business Intelligence workload v1.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/gabor-szarnyas-the-ldbc-social-network-benchmark-business-intelligence-workload.pdf">slides</a>, <a href="https://youtu.be/AJ96M8_njxE">video</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Heng Lin (Ant Group)</td> +<td>LDBC Financial Benchmark introduction – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/heng-lin-ldbc-financial-benchmark-introduction.pdf">slides</a>, <a 
href="https://youtu.be/iBhud_YjafY">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Chen Zhang (CreateLink)</td> +<td>New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/chen-zhang-new-ldbc-snb-benchmark-record-by-galaxybase-more-than-6-times-faster-and-70-percent-higher-throughput.pdf">slides</a>, <a href="https://youtu.be/sMzTsb8iw_Y">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>James Clarkson (Neo4j)</td> +<td>LDBC benchmarks: Promoting good science and industrial consumption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/james-clarkson-ldbc-benchmarks-promoting-good-science-and-industrial-consumption.pdf">slides</a>, <a href="https://youtu.be/VYG1mzcl9qQ">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Oskar van Rest (Oracle)</td> +<td>Creating and querying property graphs in Oracle, on-premise and in the cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oskar-van-rest-creating-and-querying-property-graphs-in-oracle-on-premise-and-in-the-cloud.pdf">slides</a>, <a href="https://youtu.be/2HX2Vixf2gs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:15</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>Conquering LDBC SNB BI at SF-10k – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/mingxi-wu-conquering-ldbc-snb-bi-at-sf10k.pdf">slides</a>, <a href="https://youtu.be/oJbqzQ_t3G8">video</a></td> +</tr> +<tr> +<td>12:20</td> +<td>13:20</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:20</td> +<td>13:35</td> +<td>Altan Birler (Technische Universität München)</td> +<td>Relational databases can 
handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/altan-birler-relational-databases-can-handle-graphs-too.pdf">slides</a>, <a href="https://youtu.be/cRgbdY3I2i4">video</a></td> +</tr> +<tr> +<td>13:40</td> +<td>13:55</td> +<td>David Püroja (CWI)</td> +<td>LDBC Social Network Benchmark: Interactive workload v2.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/david-puroja-ldbc-snb-interactive-workload-v2.0.pdf">slides</a></td> +</tr> +<tr> +<td>14:00</td> +<td>14:15</td> +<td>Angela Bonifati (Lyon 1 University)</td> +<td>The quest for schemas in graph databases – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/angela-bonifati-the-quest-for-schemas-in-graph-databases.pdf">slides</a>, <a href="https://youtu.be/VT7cx3Jp7V8">video</a></td> +</tr> +<tr> +<td>14:20</td> +<td>14:35</td> +<td>Matteo Lissandrini (Aalborg University)</td> +<td>Understanding graph data representations in triplestores – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/matteo-lissandrini-understanding-graph-data-representations-in-triplestores.pdf">slides</a>, <a href="https://youtu.be/xqVMJZfh_JU">video</a></td> +</tr> +<tr> +<td>14:40</td> +<td>14:55</td> +<td>Wim Martens (University of Bayreuth)</td> +<td>Path representations – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/wim-martens-path-representations.pdf">slides</a>, <a href="https://youtu.be/Ma-E5dwgf-E">video</a></td> +</tr> +<tr> +<td>15:00</td> +<td>15:20</td> +<td>Audrey Cheng (UC Berkeley)</td> +<td>TAOBench: An end-to-end benchmark for social network workloads – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/audrey-cheng-taobench.pdf">slides</a>, <a href="https://youtu.be/1p8AStxS3es">video</a></td> +</tr> +</tbody> +</table> +<h4 id="saturday-philadelphia-marriott-downtownhttpswwwmarriottcomen-ushotelsphldt-philadelphia-marriott-downtown-room-401-402-4th-floor">Saturday (<a href="https://www.marriott.com/en-us/hotels/phldt-philadelphia-marriott-downtown/">Philadelphia Marriott Downtown</a>, room 401-402, 4th floor)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Keith Hare (WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/keith-hare-property-graph-standards-process-and-timing.pdf">slides</a>, <a href="https://youtu.be/xFVD3LWnKlc">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>10:35</td> +<td>Leonid Libkin (ENS Paris)</td> +<td>Pattern matching in GQL and SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/leonid-libkin-pattern-matching-in-gql-and-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/OvGsa0qLANE">video</a></td> +</tr> +<tr> +<td>10:40</td> +<td>10:55</td> +<td>Petra Selmer (Neo4j/WG3)</td> +<td>An overview of GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/petra-selmer-towards-gql-v1-a-property-graph-query-language-standard.pdf">slides</a>, <a href="https://youtu.be/tncf2FgyIyo">video</a></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (LDBC/WG3)</td> +<td>GQL 2.0: A technical manifesto – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-gql-2.0-a-technical-manifesto.pdf">slides</a>, <a 
href="https://youtu.be/upIvpYy8C2g">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>George Fletcher (TU Eindhoven)</td> +<td>PG-Keys (LDBC Property Graph Schema Working Group) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/george-fletcher-pg-keys-keys-for-property-graphs.pdf">slides</a>, <a href="https://youtu.be/_W8-jOtcObc">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Arvind Shyamsundar (Microsoft)</td> +<td>Graph capabilities in Microsoft SQL Server and Azure SQL Database – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/arvind-shyamsundar-graph-capabilities-in-microsoft-sql-server-and-azure-database.pdf">slides</a>, <a href="https://youtu.be/xxV2BfZupGw">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>13:30</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Daniël ten Wolde (CWI)</td> +<td>Implementing SQL/PGQ in DuckDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/daniel-ten-wolde-implementing-sql-pgq-in-duckdb.pdf">slides</a>, <a href="https://youtu.be/JmSfU0BTH5w">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Oszkár Semeráth, Kristóf Marussy (TU Budapest)</td> +<td>Generation techniques for consistent, realistic, diverse, and scalable graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oszkar-semerath-generation-techniques-for-consistent-realistic-diverse-and-scalable-graphs.pdf">slides</a>, <a href="https://youtu.be/hB6j6mvh-vA">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Molham Aref (RelationalAI)</td> +<td>Graph Normal Form – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/molham-aref-graph-normal-form.pdf">slides</a>, <a 
href="https://youtu.be/-kP4Raqr5KA">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Naomi Arnold (Queen Mary University of London)</td> +<td>Temporal graph analysis of the far-right social network Gab – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/naomi-arnold-temporal-graph-analysis-of-the-far-right-social-network-gab.pdf">slides</a>, <a href="https://youtu.be/ugSkFlif4PE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:05</td> +<td>Domagoj Vrgoč (PUC Chile)</td> +<td>Evaluating path queries in MillenniumDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/domagoj-vrgoc-regular-path-queries-in-millenniumdb.pdf">slides</a>, <a href="https://youtu.be/_OzJ6vI7GNU">video</a></td> +</tr> +<tr> +<td>15:10</td> +<td>15:25</td> +<td>Pavel Klinov, Evren Sirin (Stardog)</td> +<td>Stardog&rsquo;s experience with LDBC – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/evren-sirin-stardog-experience-with-ldbc.pdf">slides</a>, <a href="https://youtu.be/CBrEeOTqGKM">video</a></td> +</tr> +</tbody> +</table> + + + + + Announcing the LDBC Financial Benchmark Task Force + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + Thu, 26 May 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + <p>We are delighted to announce the set up of the <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench) task force</a>.</p> +<p>The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. 
The FinBench is scheduled to be released at the end of 2022.</p> +<p>Compared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.</p> +<p>The FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). See the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">Work Charter for FinBench</a>.</p> +<p>If you are interested in joining the FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.</p> + + + + + Fourteenth TUC Meeting + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + Mon, 16 Aug 2021 16:00:00 +0200 + + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + <p>LDBC hosted a one-day hybrid workshop, co-located with <a href="https://vldb.org/2021/">VLDB 2021</a> on <strong>August 16 (Monday) between 16:00–20:00 CEST</strong>.</p> +<p>The physical part of the workshop was held in room Akvariet 2 of the <a href="https://www.tivolihotel.com/">Tivoli Hotel</a> (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provided an overview of LDBC&rsquo;s recent efforts. Moreover, we invited industry practitioners and academic researchers to present their latest results.</p> +<p>Talks were scheduled to be 10 minutes with a short Q&amp;A session. We had three sessions. 
Their schedules are shown below.</p> +<h4 id="16001725-cest-ldbc-updates-benchmarks-query-languages">[16:00–17:25 CEST] LDBC updates, benchmarks, query languages</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>16:00</td> +<td>Peter Boncz (CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a></td> +</tr> +<tr> +<td>16:05</td> +<td>Gábor Szárnyas (CWI)</td> +<td>Overview of LDBC benchmarks – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-benchmarks.pdf">slides</a></td> +</tr> +<tr> +<td>16:12</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>LDBC Social Network Benchmark results with TigerGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mingxi-wu-tigergraph-snb-preliminary-results.pdf">slides</a></td> +</tr> +<tr> +<td>16:24</td> +<td>Xiaowei Zhu (Ant Group)</td> +<td>Financial Benchmark proposal – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/xiaowei-zhu-financial-benchmark.pdf">slides</a></td> +</tr> +<tr> +<td>16:36</td> +<td>Petra Selmer (Neo4j)</td> +<td>Status report from the Existing Languages Working Group (ELWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/petra-selmer-elwg.pdf">slides</a>, <a href="https://youtu.be/I5A8VuFDhsA">video</a></td> +</tr> +<tr> +<td>16:48</td> +<td>Jan Hidders (Birkbeck)</td> +<td>Status report from the Property Graph Schema Working Group (PGSWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/jan-hidders-pgswg.pdf">slides</a>, <a href="https://youtu.be/iEbVi9T-HVk">video</a></td> +</tr> +<tr> +<td>17:00</td> +<td>Keith Hare (JCC 
Consulting)</td> +<td>Database Language Standards Structure and Process, SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/keith-hare-database-language-standards-structure-and-process-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/ZgFCuzods4g">video</a></td> +</tr> +<tr> +<td>17:12</td> +<td>Stefan Plantikow (GQL Editor)</td> +<td>Report on the GQL standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/stefan-plantikow-gql.pdf">slides</a>, <a href="https://youtu.be/z0pN5NwKsgc">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="17351845-cest-systems-and-data-structures">[17:35–18:45 CEST] Systems and data structures</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>17:35</td> +<td>Vasileios Trigonakis (Oracle Labs)</td> +<td>PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasileios-trigonakis-pgxd-adfs.pdf">slides</a>, <a href="https://youtu.be/cv2ZfWRBOek">video</a></td> +</tr> +<tr> +<td>17:47</td> +<td>Matthias Hauck (SAP)</td> +<td>JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/matthias-hauck-json-spatial-graph-sap-hana-cloud.pdf">slides</a>, <a href="https://youtu.be/dgpMJFho6Q8">video</a></td> +</tr> +<tr> +<td>17:59</td> +<td>Nikolay Yakovets (Eindhoven University of Technology)</td> +<td>AvantGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/nikolay-yakovets-avantgraph.pdf">slides</a>, <a href="https://youtu.be/9M9FOycovTw">video</a></td> +</tr> +<tr> +<td>18:11</td> +<td>Semih Salihoglu (University of Waterloo)</td> +<td>GRainDB: Making RDBMSs Efficient on Graph Workloads 
Through Predefined Joins – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semih-salihoglu-graindb.pdf">slides</a>, <a href="https://youtu.be/FFK3y6vPHJs">video</a></td> +</tr> +<tr> +<td>18:23</td> +<td>Semyon Grigorev (Saint Petersburg University)</td> +<td>Context-free path querying: Obstacles on the way to adoption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semyon-grigorev-cfpq.pdf">slides</a>, <a href="https://youtu.be/pha1xIpEL3I">video</a></td> +</tr> +<tr> +<td>18:35</td> +<td>Per Fuchs (Technical University of Munich)</td> +<td>Sortledton: A universal, transactional graph data structure – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/per-fuchs-sortledton.pdf">slides</a>, <a href="https://youtu.be/33ZjsNN0hhU">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="1855-2000-cest-high-level-approaches-and-benchmarks">[18:55-20:00 CEST] High-level approaches and benchmarks</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>18:55</td> +<td>Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)</td> +<td>Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/angelos-christos-anadiotis-investigative-journalism-graph-data-management.pdf">slides</a>, <a href="https://youtu.be/a1VYjyec8dg">video</a></td> +</tr> +<tr> +<td>19:07</td> +<td>Vasia Kalavri (Boston University)</td> +<td>Learning to partition unbounded graph streams – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasia-kalavri-learning-to-partition-unbounded-graph-streams.pdf">slides</a>, <a 
href="https://youtu.be/PTlUABKWniA">video</a></td> +</tr> +<tr> +<td>19:19</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Towards a Hybrid OLTP-OLAP Graph Benchmark – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/muhammad-attahir-jibril-hybrid-oltp-olap-benchmark.pdf">slides</a>, <a href="https://youtu.be/tMBVszTSJXc">video</a></td> +</tr> +<tr> +<td>19:31</td> +<td>Riccardo Tommasini (University of Tartu)</td> +<td>An outlook on Benchmarks for Graph Stream Processing – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/riccardo-tommasini-graph-stream-processing-benchmarks.pdf">slides</a>, <a href="https://youtu.be/HabvJvPXsLc">video</a></td> +</tr> +<tr> +<td>19:43</td> +<td>Mohamed Ragab (University of Tartu)</td> +<td>Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mohamed-ragab-benchranking.pdf">slides</a>, <a href="https://youtu.be/mZ8LhGUq7Wg">video</a></td> +</tr> +</tbody> +</table> + + + + + Thirteenth TUC Meeting + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + Tue, 30 Jun 2020 14:00:00 +0000 + + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + <p>LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested in attending the event, please contact Gabor Szarnyas (BME) to register.</p> +<h3 id="snb-task-force">SNB Task Force</h3> +<ul> +<li>Progress report +<ul> +<li>ACID compliance test suite</li> +<li>Integrating deletions to Datagen</li> +<li>Migrating Datagen to Spark</li> +<li>Redesign of BI read queries</li> +<li>Extensions to the driver</li> +</ul> +</li> +<li>Ongoing work +<ul> +<li>Datagen: tuning the distribution of deletes</li> +<li>Interactive 2.0 workload</li> +<li>BI 1.0 workload</li> +</ul> +</li> +</ul> +<p>Zoom links will be sent through email.</p> + + + + + Speeding Up LDBC SNB Datagen + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + Fri, 12 Jun 2020 00:00:00 +0000 + + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + <p>LDBC&rsquo;s <a href="#references">Social Network Benchmark [4]</a> (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems&rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.</p> +<p>LDBC SNB provides <a href="https://github.com/ldbc/ldbc_snb_datagen">Datagen</a> (Data Generator), which produces synthetic datasets, mimicking a social network&rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms and in public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).</p> +<h2 id="overview">Overview</h2> +<p>The benchmark&rsquo;s specification describes a social network <a href="https://github.com/ldbc/ldbc_snb_docs/blob/9253abbde94ec7eaccd366c5d4c15cca30752e36/figures/schema-comfortable.pdf">data model</a> which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment or like each other&rsquo;s posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations, which are fixed values. For the detailed specifications of the benchmark and the Datagen component, see <a href="#references">References</a>.</p> +<p>Datasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).</p> +<p><img src="datagen_flow.png" alt=""> <br> <em>Figure 1. LDBC SNB Datagen Process on Hadoop</em></p> +<p>In the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.</p> +<p><em>Note: The diagram shows the call sequence as implemented. 
All steps are sequential &ndash; including the relationship generation &ndash; even in cases when the data dependencies would allow for parallelization.</em></p> +<p>Entities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. Most entities follow a shallow representation, i.e. foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup> A notable exception is the Knows edge, which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this <em>edge as property</em> representation makes the data harder to handle in SQL than it would be with a flat join table.</p> +<p>Entity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup></p> +<p>Serialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. 
Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.</p> +<h2 id="motivations-for-the-migration">Motivations for the migration</h2> +<p>The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:</p> +<ul> +<li> +<p><strong>Better memory utilization:</strong> MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.</p> +</li> +<li> +<p><strong>Smaller codebase:</strong> The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.</p> +</li> +<li> +<p><strong>Small entry cost:</strong> Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.</p> +</li> +<li> +<p><strong>Incremental improvements:</strong> Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. 
Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.</p> +</li> +<li> +<p><strong>OSS, commodity:</strong> Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.</p> +</li> +</ul> +<h2 id="first-steps">First steps</h2> +<p>The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. The following bullet-points summarize the key notions that cropped up during the process.</p> +<ul> +<li> +<p><strong>Use your memory:</strong> A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using <code>MEMORY_AND_DISK</code>). In short, the default caching strategy was used everywhere.</p> +</li> +<li> +<p><strong>Regression tests:</strong> Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. 
The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.</p> +</li> +<li> +<p><strong>Thread-safety concerns:</strong> Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn&rsquo;t a concern in the original implementation due to the fact that MapReduce doesn&rsquo;t use thread-based parallelization for mappers and reducers.<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, <sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup> which makes it somewhat harder to find potential unguarded shared variables.</p> +</li> +</ul> +<h2 id="case-study-person-ranking">Case study: Person ranking</h2> +<p>Migrating was rather straightforward, however, the so-called person ranking step required some thought. 
The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the <a href="#references">S3G2 paper [3]</a>.</p> +<h3 id="the-original-mapreduce-version">The original MapReduce version</h3> +<p><img src="person_ranking.svg" alt=""> \ <em>Figure 2. Diagram of the MapReduce code for ranking persons</em></p> +<p>The implementation, shown in pseudocode above, works as follows:</p> +<ol> +<li>The equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low.</li> +<li>The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate &ldquo;side-channel&rdquo; file upon the completion of a reduce task.</li> +<li>In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person.</li> +</ol> +<p>Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.</p> +<h3 id="the-migrated-version">The migrated version</h3> +<p>Spark provides a sortBy function which takes care of the first step above in a single line. 
The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.</p> +<h2 id="benchmarks">Benchmarks</h2> +<p>Benchmarks were carried out on AWS <a href="https://aws.amazon.com/emr/">EMR</a>, originally utilising <a href="https://aws.amazon.com/ec2/instance-types/i3/">i3.xlarge</a> instances because of their fast NVMe SSD storage and ample amount of RAM.</p> +<p>The application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yielded slowdowns for higher values. The Spark version, on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.<sup id="fnref:5"><a href="#fn:5" class="footnote-ref" role="doc-noteref">5</a></sup> The MapReduce results were as follows:</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>16</td> +<td>1.60</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>34</td> +<td>1.13</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>40</td> +<td>1.20</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>44</td> +<td>1.32</td> +</tr> +</tbody> +</table> +<p>It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.</p> +<p><img src="mr_sf100_cpu_load.png" alt=""> <br> +<em>Figure 3. CPU Load for the Map Reduce cluster is bursty and less than<br> +50% on average (SF100, 2nd graph shows master)</em></p> +<p><img src="mr_sf100_mem_free.png" alt=""> <br> +<em>Figure 4. 
The job only starts to consume memory when already 10 minutes<br> +into the run (SF100, 2nd graph shows master)</em></p> +<p>Let&rsquo;s see how Spark fares.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>10</td> +<td>1.00</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>21</td> +<td>0.70</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>27</td> +<td>0.81</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>36</td> +<td>1.08</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +</tbody> +</table> +<p>A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn&rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. In fact, the OOM errors encountered when running SF3000 support this hypothesis even further. It was thus proposed to scale up the RAM in the instances. 
The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.</p> +<p><img src="spark_sf100_cpu_load.png" alt=""> <br> +<em>Figure 5. Full CPU utilization for Spark (SF100, last graph shows<br> +master)</em></p> +<p><img src="spark_sf100_mem_free.png" alt=""> <br> +<em>Figure 6. Spark eats up memory fast (SF100, 2nd graph shows master)</em></p> +<p>i3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it <em>only</em> has a 300 GB SSD.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>16</td> +<td>0.48</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>21</td> +<td>0.63</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>26</td> +<td>0.78</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +<tr> +<td>10000</td> +<td>303</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<p>The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.</p> +<h2 id="next-steps">Next steps</h2> +<p>The next improvement is refactoring the serializers so they use Spark&rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. 
EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.</p> +<p>As already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.</p> +<p>The Spark migration also serves as an important building block for the next generation of LDBC benchmarks. As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for <a href="#references">generating delete operations [1]</a>. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. 
Thanks for your help and feedback!</p> +<h2 id="references">References</h2> +<p>[1] <a href="https://ldbcouncil.org/docs/papers/datagen-deletes-grades-nda-2020.pdf">Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark&rsquo;s Data Generator</a></p> +<p>[2] <a href="https://www.youtube.com/watch?v=ZQOLuCOOpSI">9th TUC Meeting &ndash; LDBC SNB Datagen Update &ndash; Arnau Prat (UPC)</a> - <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">slides</a></p> +<p>[3] <a href="https://research.vu.nl/en/publications/s3g2-a-scalable-structure-correlated-social-graph-generator">S3G2: a Scalable Structure-correlated Social Graph Generator</a></p> +<p>[4] <a href="https://arxiv.org/abs/2001.02299">The LDBC Social Network Benchmark</a></p> +<p>[5] <a href="https://ldbcouncil.org/">LDBC</a> - <a href="https://github.com/ldbc">LDBC GitHub organization</a></p> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>Also makes it easier to map to a tabular format thus it is a SQL friendly representation.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>It&rsquo;s hard to imagine this done declaratively in SQL.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>Instead, multiple YARN containers have to be used if you want to parallelize on the same machine.&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>Although editors usually render these using different font styles.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:5"> +<p>With the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). 
During serialization, we check for such entities and omit them. However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.&#160;<a href="#fnref:5" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Twelfth TUC Meeting + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + Fri, 05 Jul 2019 08:30:00 +0100 + + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + <p>LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event on the last Friday of <strong><a href="https://sigmod2019.org/">SIGMOD/PODS 2019</a></strong> in Amsterdam, The Netherlands, in the conference venue of <strong><a href="http://sigmod2019.org/conf_venue">Beurs van Berlage</a></strong>. The room is the Mendes da Silva kamer. Please check its tips for <strong><a href="http://sigmod2019.org/accommodation">accommodation in Amsterdam</a></strong>.</p> +<p>Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2019">GRADES-NDA 2019</a>, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).</p> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.</p> +<p><strong>Talk proposals can be sent to Peter Boncz</strong>, who is also the local organizer. <strong>Please also send your slides to this email for archiving on this site.</strong></p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting, there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).</p> +<p>The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (<strong>room: Mendes da Silva kamer</strong>):</p> +<p>08:30-10:30 LDBC Board Meeting (non-public)</p> +<p>10:30-11:00 Coffee</p> +<p>11:00-12:45 Session 1: Graph Benchmarks</p> +<ul> +<li> +<p>11:00-11:05 Welcome &amp; introduction</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230404.pdf">11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706117.pdf">11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706118.pdf">12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706130.pdf">12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics</a></p> +</li> +</ul> +<p>12:45-14:00 Lunch</p> +<p>14:00-16:05 Session 2: Graph Query Languages</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706120.pdf">14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706121.pdf">14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706122.pdf">report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706119.pdf">14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706129.pdf">15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in GSQL, TigerGraph&rsquo;s query language</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230401.pdf">15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language</a></p> +</li> +</ul> +<p>16:05-16:30 Coffee</p> +<p>16:30-17:50 Session 3: Graph System Performance</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111968258.pdf">16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706124.pdf">16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity</a> <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706116.pptx">pptx</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706128.pdf">17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706133.pdf">17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data</a></p> +</li> +</ul> +<p>If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.</p> + + + + + Eleventh TUC Meeting + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + Fri, 08 Jun 2018 08:30:00 -0500 + + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + <p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event preceding the <a href="https://sigmod2018.org/">SIGMOD/PODS 2018</a> conference in Houston, Texas (not too far away, the whole next week). 
Note also that at SIGMOD/PODS in Houston on Sunday 10, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2018/">GRADES-NDA 2018</a> as well, so you might combine travel.</p> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (<a href="mailto:boncz@cwi.nl">boncz@cwi.nl</a>) and Larri (<a href="mailto:larri@ac.upc.edu">larri@ac.upc.edu</a>). Local organizer is Juan Sequeda (<a href="mailto:juanfederico@gmail.com">juanfederico@gmail.com</a>).</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00:</p> +<ul> +<li> +<p>10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090478.pdf">10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo</a></p> +</li> +<li> +<p>11:00-11:30 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090466.pdf">11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090463.pdf">11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090472.pdf">12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI</a></p> +</li> +<li> +<p>12:45-14:00 lunch</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090474.pdf">14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090481.pdf">14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090485.pdf">14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service</a></p> +</li> +<li> +<p>15:15-15:40 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99811329.pdf">15:40-16:05 Bryon Jacob (data.world): Broadening the Semantic Web</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99287041.pdf">16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99745793.pdf">16:30-16:55 Arthur Keen (Cambridge Semantics): AnzoGraph</a></p> +</li> +<li> +<p><a href="http://relational.ai/">16:55-17:20 Molham Aref (relational.ai)</a> - Introducing.. 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99418113.pdf">relational.ai</a></p> +</li> +<li> +<p>18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701</p> +</li> +</ul> +<h3 id="location">Location</h3> +<p>The TUC will be held at the <a href="https://www.cs.utexas.edu/">University of Texas at Austin, Department of Computer Science</a> in the <a href="https://www.google.com/maps/place/The+University+of+Texas:+Department+of+Computer+Science/@30.2860955,-97.737582,18z/data=!4m5!3m4!1s0x0:0x12edecc8226b3241!8m2!3d30.2862279!4d-97.7365348">Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712</a> Room: GDC 6.302</p> +<p>The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take the elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.</p> +<h3 id="from-austin-to-sigmodpods-houston-on-saturday-june-9">From Austin to SIGMOD/PODS (Houston) on Saturday June 9</h3> +<p>Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.</p> +<h4 id="bus">Bus</h4> +<p>One option is to take a <a href="https://us.megabus.com/journey-planner/journeys?days=1&amp;concessionCount=0&amp;departureDate=2018-06-09&amp;destinationId=318&amp;inboundOtherDisabilityCount=0&amp;inboundPcaCount=0&amp;inboundWheelchairSeated=0&amp;nusCount=0&amp;originId=320&amp;otherDisabilityCount=0&amp;pcaCount=0&amp;totalPassengers=1&amp;wheelchairSeated=0">MegaBus that departs from downtown Austin and arrives at downtown Houston</a>.</p> +<p>There is a bus that departs at 12:00PM and arrives at 3:00pm. Cost is $20 (as of April 23).</p> +<p>If you want to spend the day in Austin, there is a bus that departs at 9:55PM and arrives at 12:50am. 
Cost is $5 (as of April 23).</p> + + + + + Tenth TUC Meeting + https://ldbcouncil.org/event/tenth-tuc-meeting/ + Fri, 01 Sep 2017 10:30:00 +0100 + + https://ldbcouncil.org/event/tenth-tuc-meeting/ + <p>This will be a one-day event at the <a href="http://www.vldb.org/2017">VLDB 2017</a> conference in Munich, Germany on September 1, 2017.</p> +<p>Topics and activities of interest in these TUC meetings are:</p> +<ul> +<li>Presentation on graph data management usage scenarios.</li> +<li>Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Adrian Diaz (UPC) at <a href="mailto:adiaz@ac.upc.edu">adiaz@ac.upc.edu</a> to register; registration is free, but required.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00</p> +<p>10:30-12:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87588865.pdf">Peter Boncz (CWI): GraphQL task force update - the G-CORE proposal</a> (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868018.pptx">pptx</a>)</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868008.pdf">Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload</a></li> +<li>Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868014.pdf">LDBC Graphalytics v0.9</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868013.pdf">Graphalytics Global Competition and Graphalytics Custom Benchmark</a></li> +</ul> +<p>12:00-13:30: lunch break</p> +<p>13:30-15:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868024.pdf">Arnau Prat (UPC): Datasynth: Democratizing property graph 
generation</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868026.pdf">Marcus Paradies (SAP): SAP HANA GraphScript</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031809.pdf">Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform</a></li> +<li>Gaétan Hains (Huawei): Cost semantics for graph queries</li> +</ul> +<p>15:00-15:30: break</p> +<p>15:30-17:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031812.pdf">Petra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87195650.pdf">Markus Kaindl (Springer): SN SciGraph &ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain</a></li> +<li>Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks</li> +<li>Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset</li> +</ul> +<p>Speakers should aim for a <strong>20-minute talk</strong>.</p> +<p>Further:</p> +<ul> +<li>on Friday evening (19:00-21:00) there will be a <strong>social dinner</strong> at <a href="https://www.loewenbraeukeller.com/en/pub-and-beer-garden/">Löwenbräukeller</a>, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).</li> +<li>on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.</li> +</ul> +<h3 id="venue">Venue</h3> +<p>The Technical University of Munich (TUM) is hosting that week the <a href="http://www.vldb.org/2017">VLDB conference</a>; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops 
ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.</p> +<p>The TUC meeting will be held in <strong>Room 2607</strong> alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).</p> +<p><strong>address: Technische Universität München (TUM), Arcisstraße 21, 80333 München</strong></p> +<p><a href="https://www.google.nl/maps/place/Technische+Universit%C3%A4t+M%C3%BCnchen/@48.14966,11.5656715,17z/data=!3m1!4b1!4m5!3m4!1s0x479e7261336d8c11:0x79a04d44dc5bf19d!8m2!3d48.14966!4d11.5678602?hl=en">Google Maps</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920002.jpg" alt=""><br> +<img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920003.jpg" alt=""></p> + + + + + Ninth TUC Meeting + https://ldbcouncil.org/event/ninth-tuc-meeting/ + Thu, 09 Feb 2017 15:07:18 -0400 + + https://ldbcouncil.org/event/ninth-tuc-meeting/ + <p>LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> in Walldorf, Germany on February 9+10, 2017.</p> +<p>This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested, please contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.</p> +<p>Thursday evening (19:00-21:00) there will be a <strong>social dinner</strong> in Heidelberg.</p> +<p>Friday morning the event resumes from 9:00-12:00. 
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.</p> +<h4 id="social-dinner">Social Dinner</h4> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235334.png" alt=""></p> +<p><strong>Address: Hauptstraße 217, 69117 Heidelberg</strong><br> +<strong>Time: 19:00 / 7pm</strong></p> +<p>(See attachments at the bottom of the page)</p> +<h5 id="thursday">Thursday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>Welcome and logistics - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>9:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235329.pdf">Intro + state of the LDBC - Josep Lluis Larriba Pey</a> (UPC)</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235338.pdf">LDBC Graph QL task force</a> - Hannes Voigt (TU Dresden)</td> +</tr> +<tr> +<td>9:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235335.pdf">PGQL Status Update and Comparison to LDBC&rsquo;s Graph QL proposals</a> - Oskar van Rest (Oracle Labs)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628546.pdf">Adding shortest-paths to MonetDB</a> - Dean de Leo (CWI)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431939.pdf">Evolving Cypher for processing multiple graphs</a> - Stefan Plantikow (Neo Technology)</td> +</tr> +<tr> +<td>11:10</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235346.pdf">Standardizing Graph Database Functionality - An Invitation to Collaborate</a> - Jan Michels (ISO/ANSI SQL, Oracle)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235343.pdf">Dgraph: Graph database for production environment</a> - Tomasz Zdybal (Dgraph.io)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431945.pdf">LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap</a> - Alexandru Iosup (TU Delft)</td> +</tr> +<tr> +<td>13:20</td> +<td>LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)</td> +</tr> +<tr> +<td>13:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">LDBC SNB Datagen Update</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431943.pdf">LDBC SNB Business Intelligence Workload: Chokepoint Analysis</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431947.pdf">LDBC Benchmark Cost Specification</a> (+discussion) - Moritz Kaufmann (TU Munich)</td> +</tr> +<tr> +<td>14:40</td> +<td>coffee break</td> +</tr> +<tr> +<td>15:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76316673.pdf">EYWA: the Distributed Graph Engine in Huawei MIND Platform</a> (Yinglong Xia)</td> +</tr> +<tr> +<td>15:30</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431949.pdf">Graph Processing in SAP HANA</a> - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>15:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628563.pdf">Distributed Graph Analytics with Gradoop</a> - Martin Junghanns (Univ Leipzig)</td> +</tr> +<tr> +<td>16:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152834.pdf">Distributed graph flows: Cypher on Flink and Gradoop</a> - Max Kießling (Neo Technology)</td> +</tr> +<tr> +<td>16:30</td> +<td>closing - Peter Boncz</td> +</tr> +<tr> +<td>17:30</td> +<td>end</td> +</tr> +</tbody> +</table> +<h5 id="friday">Friday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>welcome - Peter Boncz</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152833.pdf">Graph processing in obi4wan</a> - Frank Smit (OBI4WAN)</td> +</tr> +<tr> +<td>9:40</td> +<td>Graph problems in the space domain - Albrecht Schmidt (ESA)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75792387.pdf">Medical Ontologies for Healthcare</a> - Michael Neumann (SAP)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76447745.pdf">The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries</a> - Gabor Szarnyas (BME)</td> +</tr> +<tr> +<td>11:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76021761.pdf">Efficient sparse 
matrix computations and their generalization to graph computing applications</a> - Albert-Jan Yzelman (Huawei)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152837.pdf">Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge</a> - Atanas Kyriakov (Ontotext)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td>LDBC Board of Directors Meeting</td> +</tr> +<tr> +<td>17:00</td> +<td>end</td> +</tr> +</tbody> +</table> +<h3 id="logistics">Logistics</h3> +<h5 id="important-things-to-know"><strong>Important things to know</strong></h5> +<p>The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">link</a></p> +<h5 id="venue"><strong>Venue</strong></h5> +<p>The TUC meeting will be held in the <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> at the SAP Guesthouse Kalipeh (<a href="https://www.kalipeh.com">https://www.kalipeh.com</a>). The address is:</p> +<p><strong>WDF 44 / SAP Guesthouse Kalipeh<br> +Dietmar-Hopp-Allee 15<br> +69190 Walldorf<br> +Germany</strong></p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p><a href="https://www.google.com/maps/place/SAP+Guesthouse+Kalipeh/@49.2951903,8.6436224,17z/data=!3m1!4b1!4m5!3m4!1s0x4797bea343a566af:0xd70698f3503ab74b!8m2!3d49.2951868!4d8.6458111">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/69042180.png" alt=""></p> +<h4 id="getting-there"><strong>Getting there</strong></h4> +<h5 id="by-plane"><strong>By plane</strong></h5> +<p>There are two airports close to SAP&rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). 
The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.</p> +<p>When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt-Hahn is approximately one hour from the Frankfurt main airport by car.</p> +<p>The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).</p> +<p>The journey from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).</p> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<p><strong>Traveling from Frankfurt Airport (FRA) to SAP Headquarters:</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.&rdquo;</li> +<li>Follow the A5 to &ldquo;Basel/Karlsruhe/Heidelberg.&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<p>(Should you use a navigational system which does not recognize the street name &lsquo;Dietmar-Hopp-Allee&rsquo; please use &lsquo;Neurottstrasse&rsquo; instead.)</p> +<p><strong>Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:</strong></p> +<p>To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. 
The route via Karlsruhe is a bit shorter yet may be more congested.</p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A8/Stuttgart/B27.&rdquo;</li> +<li>Stay on A8 and follow the sign for &ldquo;Karlsruhe/Heilbronn/Singen/A8.&rdquo;</li> +<li>Follow A8 to Karlsruhe.</li> +<li>Take exit 41 &ndash; &ldquo;Dreieck Karlsruhe&rdquo; to merge onto A5 toward &ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<h6 id="parking"><strong>Parking</strong></h6> +<p>The closest parking lot to the event location is P7 (see figure above).</p> +<h5 id="by-train"><strong>By Train</strong></h5> +<p>As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.</p> +<p><strong>From Frankfurt Airport (FRA) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to Terminal 1, level T (see overview in Appendix).</li> +<li>Go to the AIRail Terminal &ndash; &ldquo;Fernbahnhof&rdquo; (long-distance trains).</li> +<li>Choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP.&rdquo; It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> +<p><strong>From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to the S-Bahn station in the airport, following the sign (station is called &ldquo;Stuttgart Flughafen/Messe&rdquo;).</li> +<li>Take 
train number S2 or S3 to &ldquo;Stuttgart Hauptbahnhof&rdquo; (main station).</li> +<li>From Stuttgart Hauptbahnhof choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP&rdquo;. It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> + + + + + LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + Tue, 06 Sep 2016 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + <p>LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.</p> +<p>LDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.</p> +<p>Tim Hegeman of <a href="https://www.tudelft.nl">TU Delft</a> is today presenting the technical paper describing LDBC Graphalytics at the important <a href="https://www.vldb.org/conference.html">VLDB</a> (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.</p> +<p>LDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.</p> +<p>Learn more: <a href="/ldbc-graphalytics">LDBC Graphalytics</a></p> +<p>GitHub: <a href="https://github.com/tudelft-atlarge/graphalytics">https://github.com/tudelft-atlarge/graphalytics</a></p> + + + + + Eighth TUC Meeting + https://ldbcouncil.org/event/eighth-tuc-meeting/ + Wed, 22 Jun 2016 14:45:20 -0400 + + https://ldbcouncil.org/event/eighth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a> in Redwood Shores facility on <strong>Wednesday and Thursday June 22-23, 2016</strong>.</p> +<p>This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify Oracle security in advance, registration requests need to be in by <strong>June 12</strong>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. 
We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.</p> +<p>On this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +<li><a href="#accommodation">Accommodation</a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1pm.</p> +<h6 id="wednesday-22th-of-june-2016-room-203"><strong>Wednesday, 22nd of June 2016 (Room 203)</strong></h6> +<p>(full morning: LDBC Board of Directors meeting)</p> +<ul> +<li>12:00 - 13:00 Lunch (provided)</li> +<li>13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome.</li> +<li>13:30 - 14:00 Peter Boncz (CWI) <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133891.pdf">LDBC introduction and status update</a>.</li> +<li>14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. 
Larriba-Pey)</li> +<li>14:00 Arnau Prat (DAMA-UPC). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133902.pdf">Social Network Benchmark, Interactive workload</a>.</li> +<li>14:30 Tim Hegeman (TU Delft). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133893.pdf">Social Network Benchmark, Analytics workload</a>.</li> +<li>15:00 - 15:30 Coffee break</li> +<li>15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133897.pdf">Graphing Healthcare Networks: Data, Analytics, and Use Cases.</a></li> +<li>16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133901.pdf">Frappé: Querying and managing evolving code dependency graphs</a>.</li> +<li>16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133895.pdf">UniProt: challenges of a public SPARQL endpoint.</a></li> +</ul> +</li> +<li>17:00 - 18:30 Graph Technologies (chair Peter Boncz) +<ul> +<li>17:00 Eugene I. Chong (Oracle USA). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133904.pdf">Balancing Act to improve RDF Query Performance in Oracle Database</a>.</li> +<li>17:30 Lijun Chang (University of New South Wales). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133906.pdf">Efficient Subgraph Matching by Postponing Cartesian Products</a>.</li> +<li>18:00 Weining Qian (East China Normal University). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133908.pdf">On Statistical Characteristics of Real-Life Knowledge Graphs</a>.</li> +</ul> +</li> +</ul> +<h6 id="thursday-23th-of-june-2016-room-203"><strong>Thursday, 23rd of June 2016 (Room 203)</strong></h6> +<ul> +<li>08:00 - 09:00 Breakfast (provided)</li> +<li>09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) +<ul> +<li>09:00 Peter Boncz (CWI). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133896.pdf">Query Language Task Force status</a></li> +<li>09:45 Marcus Paradies (SAP). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297729.pdf">Social Network Benchmark, Business Intelligence workload</a></li> +</ul> +</li> +<li>10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) +<ul> +<li>10:00 Sergey Edunov (Facebook). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297731.pdf">Generating realistic trillion-edge graphs</a></li> +<li>10:30 George Fletcher (TU Eindhoven). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297733.pdf">An open source framework for schema-driven graph instance and graph query workload generation</a>.</li> +<li>11:00 Yinglong Xia (Huawei Research America). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297735.pdf">An Efficient Big Graph Analytics Platform</a>.</li> +<li>11:30 Zhe Wu (Oracle USA). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297737.pdf">Bridging RDF Graph and Property Graph Data Models</a></li> +</ul> +</li> +<li>12:00 - 13:30 Lunch (provided)</li> +<li>13:30 - 15:30 Graph Technologies (chair Arnau Prat) +<ul> +<li>13:30 Tobias Lindaaker (Neo Technology). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297740.pdf">An open standard for graph queries: the Cypher contribution</a></li> +<li>14:00 Arash Termehchy (Oregon State University). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297742.pdf">Toward Representation Independent Graph Querying &amp; Analytics</a></li> +<li>14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297745.pdf">In the service of the federation</a></li> +<li>15:00 Nandish Jayaram (Pivotal). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297747.pdf">Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs</a>.</li> +</ul> +</li> +<li>15:30 - 16:00 Coffee break</li> +<li>16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>16:00 Jans Aasman (Franz Inc.). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428806.pdf">Semantic Data Lake for Healthcare</a></li> +<li>16:15 Kevin Madden (Tom Sawyer Software). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428812.pdf">Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis</a></li> +<li>16:45 Juan Sequeda (Capsenta). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428810.pdf">Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources</a></li> +<li>17:15 Kevin Wilkinson (Hewlett Packard Labs). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428808.pdf">LDBC SNB extensions</a></li> +</ul> +</li> +<li>17:45 - 18:15 Closing discussion</li> +</ul> +<h6 id="friday-24th-of-june-2016-room-105"><strong>Friday, 24th of June 2016 (Room 105)</strong></h6> +<p>At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (<strong>GRADES16</strong>).</p> +<p>18:30 social dinner for GRADES registrants (place to be announced)</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>22nd and 23rd June 2016</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a></p> +<p>The address is:</p> +<p><strong>Room 203 (Wed-Thu) &amp; Room 105 (Fri)</strong><br> +<strong>Oracle Conference Center</strong><br> +<strong>350 Oracle Parkway</strong><br> +<strong>Redwood City, CA 94065, USA</strong></p> +<p><strong>Maps and situation</strong></p> +<p><a href="https://www.google.com/maps/place/Oracle+Conference+Center/@37.5322827,-122.2667034,17z/data=!3m1!4b1!4m2!3m1!1s0x808f98b5450e8ca3:0xdc75e8b1c02bbb91">Google Maps link</a></p> +<p>Oracle Campus map:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/40927234.jpg" alt=""></p> +<h5 id="getting-there"><strong>Getting there</strong></h5> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<ul> +<li>[Southbound] <strong>-</strong> Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine 
World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +<li>[Northbound] <strong>-</strong> Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +</ul> +<h5 id="parking"><strong>Parking</strong></h5> +<p>The Conference Center has a designated parking lot located directly across from the building. If the lot is filled, there is also additional parking in any of the parking garages located nearby. No parking permits are needed.</p> +<h5 id="public-transport"><strong>Public transport</strong></h5> +<p>Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.</p> +<ul> +<li>Caltrain timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/weekdaytimetable.html</a></li> +<li>Oracle Shuttle timetables: <a href="http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html">http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html</a></li> +</ul> +<p>You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.</p> +<p>Alternatively, SamTrans (San Mateo County&rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.</p> + + + + + LDBC and Apache Flink + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + Mon, 16 Nov 2015 14:47:00 +0000 + + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + <p>Apache Flink <a href="#references">[1]</a> is an open source platform for 
distributed stream and batch data processing. Flink&rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.</p> +<p><img src="https://flink.apache.org/img/flink-stack-small.png" alt=""></p> +<p>Flink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.</p> +<p>The following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">&gt;</span> text <span style="color:#f92672">=</span> env<span style="color:#f92672">.</span><span style="color:#a6e22e">fromElements</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the past controls the future.&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the present controls the past.&#34;</span><span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> 
+</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>Tuple2<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">,</span> Integer<span style="color:#f92672">&gt;&gt;</span> wordCounts <span style="color:#f92672">=</span> text +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">flatMap</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> LineSplitter<span style="color:#f92672">())</span> <span style="color:#75715e">// splits the line and outputs (word,1) +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>tuples<span style="color:#f92672">.</span><span style="color:#a6e22e">groupBy</span><span style="color:#f92672">(</span><span style="color:#ae81ff">0</span><span style="color:#f92672">)</span> <span style="color:#75715e">// group by word +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">.</span><span style="color:#a6e22e">sum</span><span style="color:#f92672">(</span><span style="color:#ae81ff">1</span><span style="color:#f92672">);</span> <span style="color:#75715e">// sum the 1&#39;s +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>wordCounts<span style="color:#f92672">.</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop <a href="#references">[2]</a>. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. 
To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing <a href="#references">[3]</a>. Using the class <code>LDBCToFlink</code>, LDBC output files can be read directly from HDFS or from the local file system:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;hdfs:///ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> <span style="color:#75715e">// or &#34;/path/to/social_network&#34; +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> vertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span 
style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> edges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The tuple classes <code>LDBCVertex</code> and <code>LDBCEdge</code> hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.</p> +<p>Each <code>LDBCVertex</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all vertices</li> +<li>a vertex label (e.g. <code>Person</code>, <code>Comment</code>)</li> +<li>a key-value map of properties, including multivalued properties (e.g. <code>Person.email</code>)</li> +</ul> +<p>Each <code>LDBCEdge</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all edges</li> +<li>an edge label (e.g. <code>knows</code>, <code>likes</code>)</li> +<li>a source vertex identifier</li> +<li>a target vertex identifier</li> +<li>a key-value map of properties</li> +</ul> +<p>The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label <code>Person</code> and edges with the label <code>knows</code> and use Gelly to compute the connected components of that subgraph. 
The full source code is available on GitHub <a href="#references">[4]</a>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter vertices with label “Person” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> ldbcVertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span 
style="color:#f92672">.</span><span style="color:#a6e22e">VERTEX_CLASS_PERSON</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter edges with label “knows” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> ldbcEdges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span style="color:#f92672">.</span><span style="color:#a6e22e">EDGE_CLASS_KNOWS</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly vertices suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> vertices <span style="color:#f92672">=</span> ldbcVertices<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly edges suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span 
style="color:#f92672">&lt;</span>Edge<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;&gt;</span> edges <span style="color:#f92672">=</span> ldbcEdges<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>Graph<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;</span> g <span style="color:#f92672">=</span> Graph<span style="color:#f92672">.</span><span style="color:#a6e22e">fromDataSet</span><span style="color:#f92672">(</span>vertices<span style="color:#f92672">,</span> edges<span style="color:#f92672">,</span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// run connected components on the subgraph for 10 iterations +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> components <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> g<span style="color:#f92672">.</span><span style="color:#a6e22e">run</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> ConnectedComponents<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;(</span><span 
style="color:#ae81ff">10</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// print the component id of the first 10 vertices +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>components<span style="color:#f92672">.</span><span style="color:#a6e22e">first</span><span style="color:#f92672">(</span><span style="color:#ae81ff">10</span><span style="color:#f92672">).</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The ldbc-flink-import tool is available on GitHub <a href="#references">[3]</a> and licensed under the GNU GPLv3. If you have any questions regarding the tool, please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.</p> +<p>If you want to learn more about Apache Flink, a good starting point is the main documentation <a href="#references">[5]</a>, and if you have any questions, feel free to ask on the official mailing lists.<br> +There is also a nice set of videos <a href="#references">[6]</a> available from the latest Flink Forward conference.</p> +<h4 id="references">References</h4> +<p>[1] <a href="http://flink.apache.org/">http://flink.apache.org/</a></p> +<p>[2] <a href="https://github.com/dbs-leipzig/gradoop">https://github.com/dbs-leipzig/gradoop</a></p> +<p>[3] <a href="https://github.com/s1ck/ldbc-flink-import">https://github.com/s1ck/ldbc-flink-import</a></p> +<p>[4] <a href="https://gist.github.com/s1ck/b33e6a4874c15c35cd16">https://gist.github.com/s1ck/b33e6a4874c15c35cd16</a></p> +<p>[5] <a href="https://ci.apache.org/projects/flink/flink-docs-release-0.10/">https://ci.apache.org/projects/flink/flink-docs-release-0.10/</a></p> +<p>[6] <a 
href="https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA">https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA</a></p> + + + + + Seventh TUC Meeting + https://ldbcouncil.org/event/seventh-tuc-meeting/ + Mon, 09 Nov 2015 14:17:30 -0400 + + https://ldbcouncil.org/event/seventh-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.research.ibm.com/labs/watson">IBM&rsquo;s TJ Watson</a> facility on <strong>Monday and Tuesday November 9/10, 2015.</strong></p> +<p>This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify IBM security in advance, registration requests need to be in by Nov 1.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, if you or your colleagues have contacts at companies that deal with graph data management scenarios, we ask you to invite them to attend and possibly present as well. LDBC is always looking to expand its circle of TUC meeting participants, its contacts among graph technology users and, eventually, its membership base.</p> +<p>On this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a><br> +- <a href="#date"><strong>Date</strong></a><br> +- <a href="#venue"><strong>Venue</strong></a><br> +- <a href="#maps-and-situation"><strong>Maps and situation</strong></a><br> +- <a href="#getting-there"><strong>Getting there</strong></a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>Monday, 9th of November 2015</strong></p> +<p>8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba-Pey)</p> +<p>9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)</p> +<p>9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)</p> +<p>9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload</p> +<p>10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload</p> +<p>10:30-11:00 Coffee break</p> +<p>11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)</p> +<p>11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.</p> +<p>11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.</p> +<p>12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)</p> +<p>14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox</p> +<p>14:30 Peter Kogge (Notre Dame). 
BFS as in Graph500 on today&rsquo;s architectures</p> +<p>15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G</p> +<p>15:30-16:00 Coffee break</p> +<p>16:00 - 17:00 Technologies (chair Irini Fundulaki)</p> +<p>16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store</p> +<p>16:30 David Ediger (GeorgiaTech). STINGER</p> +<p>17:00 Gary King (Franz Inc.). AllegroGraph&rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties</p> +<p>17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics</p> +<p>18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase</p> +<p>19:00 Social dinner</p> +<p><strong>Tuesday 10th November 2015</strong></p> +<p>9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)</p> +<p>9:00 Philip Rathle (Neo). On openCypher</p> +<p>9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification</p> +<p>9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions</p> +<p>10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation</p> +<p>10:30 - 11:00 Coffee break</p> +<p>11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)</p> +<p>11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL</p> +<p>11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,</p> +<p>11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis</p> +<p>12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>9th and 10th November 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the IBM Thomas J Watson Research Center.<br> +The address is:</p> +<p><strong>IBM Thomas J Watson Research Center</strong><br> +<strong>1101 Kitchawan Rd,</strong><br> +<strong>Yorktown Heights, NY 10598, USA</strong></p> +<p>If you are using a <em>GPS system</em>, please enter <strong>&ldquo;200 Aqueduct Road, Ossining NY, 10562&rdquo;</strong> for accurate directions to the lab entrance. You may also want to check the routing online.</p> +<p>The meeting will take place in the <em>Auditorium</em> on November 9th, and in Meeting Room <em>20-043</em> on November 10th.</p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p>We strongly recommend that you <strong>rent a car</strong> for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walking distance of the IBM T.J. Watson Research Center. Feel free to arrange a carpool with other attendees. You may find car rental and hotels through <a href="http://www.orbitz.com">www.orbitz.com</a>, or <a href="http://www.expedia.com">www.expedia.com</a>. Feel free to email <a href="mailto:yxia@us.ibm.com">yxia@us.ibm.com</a> with any questions.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/seventh-tuc-meeting/attachments/6882333/15926330.png" alt=""></p> +<h6 id="getting-there"><strong>Getting there</strong></h6> +<p><strong>Upper and Eastern New England</strong></p> +<p>Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. 
IBM is on the left.</p> +<p><strong>New Haven and Connecticut Shores</strong></p> +<p>Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New Jersey</strong></p> +<p>Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>Upstate New York</strong></p> +<p>Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New York City (Manhattan)</strong></p> +<p>Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>John F. Kennedy International Airport</strong></p> +<p>North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>LaGuardia Airport</strong></p> +<p>East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. 
Continue with instructions from John F. Kennedy International Airport, above.</p> +<p><strong>Newark International Airport</strong></p> +<p>North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.</p> +<p><strong>Stewart International Airport</strong></p> +<p>Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.</p> +<p><strong>Westchester County Airport</strong></p> +<p>Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.</p> +<p><strong>Public Transportation</strong></p> +<p>Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.</p> + + + + + Elements of Instance Matching Benchmarks: a Short Overview + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + Tue, 16 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + <p>The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. 
In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using <strong>instance matching</strong> techniques and tools. Instance matching is also known as <strong>record linkage</strong> <a href="#references">[1]</a>, <strong>duplicate detection</strong> <a href="#references">[2]</a>, <strong>entity resolution</strong> <a href="#references">[3]</a> and <strong>object identification</strong> <a href="#references">[4]</a>.</p> +<p>For instance, a search in Geonames (<a href="http://www.geonames.org/">http://www.geonames.org/</a>) for &ldquo;Athens&rdquo; would return a resource (i.e., URI) accompanied by a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as DBpedia (<a href="http://dbpedia.org/">http://dbpedia.org/</a>) or Open Government Datasets (<a href="http://data.gov.gr/">http://data.gov.gr/</a>). To obtain all the necessary information about the city of Athens, we need to establish that the retrieved resources refer to the same real world object.</p> +<p>Web resources are published by &ldquo;autonomous agents&rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data. 
Clearly, these reasons are not limited to problems that arose in the era of Web Data; it is thus not surprising that instance matching systems have been around for several years <a href="#references">[2]</a><a href="#references">[5]</a>.</p> +<p>It is, however, essential at this point to develop, along with instance and entity matching systems, <em>instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs</em>. Hence, well-defined, good-quality benchmarks are important for comparing the performance of instance matching systems that are available or under development. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.</p> +<p>An instance matching benchmark for Linked Data consists of a <em>source</em> and <em>target dataset</em> implementing a set of <em>test-cases</em>, where each test case addresses a different kind of requirement regarding instance matching, a <em>ground truth</em> or <em>gold standard</em> and finally the <em>evaluation metrics</em> used to <em>assess the benchmark.</em></p> +<p>Datasets are the raw material of a benchmark. A benchmark comprises a <em>source</em> and a <em>target</em> dataset, and the objective of an instance matching system is to discover the matches between the two. 
Datasets are characterized by (a) their <em>nature</em> (<em>real</em> or <em>synthetic</em>), (b) the <em>schemas/ontologies</em> they use, (c) their <em>domains</em>, (d) the <em>languages</em> they are written in, and (e) the <em>variations/heterogeneities</em> of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. <em>Synthetic datasets</em> are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner.</p> +<p>Datasets (and benchmarks) may contain different <em>kinds of variations</em> that correspond to <em>different test cases</em>. According to Ferrara et al. <a href="#references">[6]</a><a href="#references">[7]</a>, three kinds of variations exist for Linked Data, namely <em>data variations</em>, <em>structural variations</em> and <em>logical variations</em>. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.</p> +<p>The common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations. 
On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.</p> +<p>The <em>gold standard</em> is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.</p> +<p>Last, an instance matching benchmark uses <em>evaluation metrics</em> to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard <em>precision</em>, <em>recall</em> and <em>f-measure</em> metrics.</p> +<h4 id="references">References</h4> +<p>[1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. 
WWW 2006.</p> +<p>[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamazal, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).</p> +<p>[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.</p> +<p>[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.</p> +<p>[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.</p> +<p>[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3rd ISWC workshop on ontology matching (OM 2008).</p> +<p>[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + Wed, 10 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + <p>In this post we will look at running the <a href="https://ldbcouncil.org/developer/snb">LDBC SNB</a> on <a href="https://virtuoso.openlinksw.com/">Virtuoso</a>.</p> +<p>First, let&rsquo;s recap what the benchmark is about:</p> +<ol> +<li> +<p>fairly frequent short updates, with no update contention worth mentioning</p> +</li> +<li> +<p>short random lookups</p> +</li> +<li> +<p>medium complex queries centered around a person&rsquo;s social environment</p> +</li> +</ol> +<p>The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.</p> +<p>The DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, <em>per se,</em> since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.</p> +<p>The workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.</p> +<p>The test system is the same as used in the <a href="http://www.openlinksw.com/weblog/oerling/?id=1739">TPC-H series</a>: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics branch</a> of <a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack, available from www.github.com</a>.</p> +<p>The dataset is the SNB 300G set, with:</p> +<table> +<thead> +<tr> +<th>1,136,127</th> +<th>persons</th> +</tr> +</thead> +<tbody> +<tr> +<td>125,249,604</td> +<td>knows edges</td> +</tr> +<tr> +<td>847,886,644</td> +<td>posts, including replies</td> +</tr> +<tr> +<td>1,145,893,841</td> +<td>tags of posts or replies</td> +</tr> +<tr> +<td>1,140,226,235</td> +<td>likes of posts or replies</td> +</tr> +</tbody> +</table> +<p>As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.</p> +<p>Below are the numerical quantities for a 400K operation run after 150K operations worth of warmup.</p> +<p><strong>Duration:</strong> 10:41.251<br> +<strong>Throughput:</strong> 623.71 (op/s)</p> +<p>The statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.</p> +<table> +<thead> +<tr> +<th>% of total</th> +<th>total_wait</th> +<th>name</th> +<th>count</th> +<th>mean</th> +<th>min</th> +<th>max</th> +</tr> +</thead> +<tbody> +<tr> +<td>20%</td> +<td>4,231,130</td> +<td>LdbcQuery5</td> +<td>656</td> +<td>6,449.89</td> +<td>245</td> +<td>10,311</td> +</tr> +<tr> +<td>11%</td> +<td>2,272,954</td> +<td>LdbcQuery8</td> +<td>18,354</td> +<td>123.84</td> +<td>14</td> +<td>2,240</td> +</tr> +<tr> +<td>10%</td> +<td>2,200,718</td> +<td>LdbcQuery3</td> +<td>388</td> +<td>5,671.95</td> +<td>468</td> +<td>17,368</td> +</tr> +<tr> +<td>7.3%</td> +<td>1,561,382</td> +<td>LdbcQuery14</td> +<td>1,124</td> +<td>1,389.13</td> +<td>4</td> +<td>5,724</td> +</tr> +<tr> +<td>6.7%</td> +<td>1,441,575</td> +<td>LdbcQuery12</td> +<td>1,252</td> +<td>1,151.42</td> +<td>15</td> +<td>3,273</td> +</tr> +<tr> +<td>6.5%</td> +<td>1,396,932</td> +<td>LdbcQuery10</td> +<td>1,252</td> +<td>1,115.76</td> +<td>13</td> +<td>4,743</td> +</tr> +<tr> +<td>5%</td> +<td>1,064,457</td> +<td>LdbcShortQuery3PersonFriends</td> +<td>46,285</td> +<td>22.9979</td> +<td>0</td> +<td>2,287</td> +</tr> +<tr> +<td>4.9%</td> +<td>1,047,536</td> +<td>LdbcShortQuery2PersonPosts</td> +<td>46,285</td> +<td>22.6323</td> +<td>0</td> +<td>2,156</td> +</tr> +<tr> +<td>4.1%</td> +<td>885,102</td> +<td>LdbcQuery6</td> +<td>1,721</td> +<td>514.295</td> +<td>8</td> +<td>5,227</td> +</tr> +<tr> +<td>3.3%</td> +<td>707,901</td> +<td>LdbcQuery1</td> +<td>2,117</td> +<td>334.389</td> +<td>28</td> +<td>3,467</td> +</tr> +<tr> +<td>2.4%</td> +<td>521,738</td> +<td>LdbcQuery4</td> +<td>1,530</td> +<td>341.005</td> +<td>49</td> +<td>2,774</td> +</tr> +<tr> +<td>2.1%</td> +<td>440,197</td> +<td>LdbcShortQuery4MessageContent</td> +<td>46,302</td> +<td>9.50708</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.9%</td> +<td>407,450</td> +<td>LdbcUpdate5AddForumMembership</td> +<td>14,338</td> +<td>28.4175</td> +<td>0</td> +<td>2,008</td> +</tr> +<tr> 
+<td>1.9%</td> +<td>405,243</td> +<td>LdbcShortQuery7MessageReplies</td> +<td>46,302</td> +<td>8.75217</td> +<td>0</td> +<td>2,112</td> +</tr> +<tr> +<td>1.9%</td> +<td>404,002</td> +<td>LdbcShortQuery6MessageForum</td> +<td>46,302</td> +<td>8.72537</td> +<td>0</td> +<td>1,968</td> +</tr> +<tr> +<td>1.8%</td> +<td>387,044</td> +<td>LdbcUpdate3AddCommentLike</td> +<td>12,659</td> +<td>30.5746</td> +<td>0</td> +<td>2,060</td> +</tr> +<tr> +<td>1.7%</td> +<td>361,290</td> +<td>LdbcShortQuery1PersonProfile</td> +<td>46,285</td> +<td>7.80577</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.6%</td> +<td>334,409</td> +<td>LdbcShortQuery5MessageCreator</td> +<td>46,302</td> +<td>7.22234</td> +<td>0</td> +<td>2,055</td> +</tr> +<tr> +<td>1%</td> +<td>220,740</td> +<td>LdbcQuery2</td> +<td>1,488</td> +<td>148.347</td> +<td>2</td> +<td>2,504</td> +</tr> +<tr> +<td>0.96%</td> +<td>205,910</td> +<td>LdbcQuery7</td> +<td>1,721</td> +<td>119.646</td> +<td>11</td> +<td>2,295</td> +</tr> +<tr> +<td>0.93%</td> +<td>198,971</td> +<td>LdbcUpdate2AddPostLike</td> +<td>5,974</td> +<td>33.3062</td> +<td>0</td> +<td>1,987</td> +</tr> +<tr> +<td>0.88%</td> +<td>189,871</td> +<td>LdbcQuery11</td> +<td>2,294</td> +<td>82.7685</td> +<td>4</td> +<td>2,219</td> +</tr> +<tr> +<td>0.85%</td> +<td>182,964</td> +<td>LdbcQuery13</td> +<td>2,898</td> +<td>63.1346</td> +<td>1</td> +<td>2,201</td> +</tr> +<tr> +<td>0.74%</td> +<td>158,188</td> +<td>LdbcQuery9</td> +<td>78</td> +<td>2,028.05</td> +<td>1,108</td> +<td>4,183</td> +</tr> +<tr> +<td>0.67%</td> +<td>143,457</td> +<td>LdbcUpdate7AddComment</td> +<td>3,986</td> +<td>35.9902</td> +<td>1</td> +<td>1,912</td> +</tr> +<tr> +<td>0.26%</td> +<td>54,947</td> +<td>LdbcUpdate8AddFriendship</td> +<td>571</td> +<td>96.2294</td> +<td>1</td> +<td>988</td> +</tr> +<tr> +<td>0.2%</td> +<td>43,451</td> +<td>LdbcUpdate6AddPost</td> +<td>1,386</td> +<td>31.3499</td> +<td>1</td> +<td>2,060</td> +</tr> +<tr> +<td>0.01%</td> +<td>1,848</td> 
+<td>LdbcUpdate4AddForum</td> +<td>103</td> +<td>17.9417</td> +<td>1</td> +<td>65</td> +</tr> +<tr> +<td>0.00%</td> +<td>44</td> +<td>LdbcUpdate1AddPerson</td> +<td>2</td> +<td>22</td> +<td>10</td> +<td>34</td> +</tr> +</tbody> +</table> +<p>At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.</p> +<p>The implementation is well optimized in general but still has maybe 30% room for improvement. We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.</p> +<p>The set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:</p> +<ul> +<li> +<p><em>Cardinality estimation under heavy data skew —</em> Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.</p> +</li> +<li> +<p><em>Covering indices —</em> Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. 
For example, an index on post by author can also contain the post&rsquo;s creation date.</p> +</li> +<li> +<p><em>Multi-hop graph traversal —</em> Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.</p> +</li> +<li> +<p><em>Top <em>K</em> —</em> Most queries returning posts order results by descending date. Once there are at least <em>k</em> results, anything older than the <em>k</em>th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top <em>k</em>.</p> +</li> +<li> +<p><em>Late projection —</em> Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.</p> +</li> +<li> +<p><em>Materialization —</em> Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.</p> +</li> +<li> +<p><em>Concurrency control —</em> Read-write contention is rare, as updates are randomly spread over the database. 
However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.</p> +</li> +</ul> +<p>In subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + SNB and Graphs Related Presentations at GRADES '15 + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + Fri, 29 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + <p>Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. 
GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.</p> +<p>Among the papers published in this edition we have &ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms&rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in <a href="https://github.com/ldbc">https://github.com/ldbc</a>) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have &ldquo;Microblogging Queries on Graph Databases: an Introspection&rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention &ldquo;Frappé: Querying the Linux Kernel Dependency Graph&rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.</p> +<p><a href="http://event.cwi.nl/grades2015/program.shtml">Check the complete agenda.</a></p> +<p>Meet you in Melbourne!</p> + + + + + SNB Interactive Part 2: Modeling Choices + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + Tue, 26 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + <p><a href="https://ldbcouncil.org/benchmarks/snb">​SNB Interactive</a> is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.</p> +<p>In the case of <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server">Virtuoso</a>, we have played with <a href="http://dbpedia.org/resource/SQL">SQL</a> and <a href="http://dbpedia.org/resource/SPARQL">SPARQL</a> implementations. For a fixed schema and well known workload, SQL will always win. 
The reason for this is that this allows materializing multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.</p> +<h3 id="schema-design">Schema Design</h3> +<p>SNB has a regular schema described by a <a href="https://en.wikipedia.org/wiki/Unified_Modeling_Language">UML</a> diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.</p> +<p>The only table-level choice has to do with whether <code>posts</code> and <code>comments</code> are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. There is a single nullable foreign key from the reply to the post/comment being replied to.</p> +<p>The workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. 
There, having a composite key of <code>ps_creatorid</code>, <code>ps_creationdate</code>, <code>ps_postid</code> pays off since the top-k on <code>creationdate</code> can be pushed down into the index without needing a reference to the table.</p> +<p>The implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the <a href="http://dbpedia.org/resource/DEX_(Graph_database)">Sparksee</a> and <a href="http://dbpedia.org/resource/Neo4j">Neo4J</a> implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.</p> +<p>The benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 &ldquo;top of the wall&rdquo; to a single lookup. This does not however appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. <code>person1</code>, <code>person2</code> -&gt; <code>weight</code>. 
<code>Person1</code> is by convention the one with the smaller <code>p_personid</code>. Note that comparing id&rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI&rsquo;s with disastrous performance implications unless an implementation specific trick were used.</p> +<p>In the next installment we will analyze an actual run.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + Mon, 25 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + <p>LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.</p> +<p>On the industry track, LDBC will be presenting the <em>Social Network Benchmark Interactive Workload</em> by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).</p> +<p>You can read more about the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark here</a> and collaborate if you&rsquo;re interested!</p> +<p>The other presentation will be at the GRADES workshop within the SIGMOD program regarding <em>Graphalytics: A Big Data Benchmark for Graph-Processing platforms</em> by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.</p> +<p>Don&rsquo;t forget to check our presentations if you&rsquo;re attending the SIGMOD!</p> + + + + + SNB Interactive Part 1: What Is SNB Interactive Really About? + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + Thu, 14 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + <p>This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. 
This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.</p> +<p>With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.</p> +<p>The essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.</p> +<p>So far, we see that SNB confronts the implementor with choices in the following areas:</p> +<ul> +<li>Data model: Relational, RDF, property graph?</li> +<li>Physical model, e.g. row-wise vs. column wise storage</li> +<li>Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures</li> +<li>Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers</li> +<li>Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc.</li> +<li>Parameters vs. literals: Sometimes different parameter values result in different optimal query plans</li> +<li>Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload</li> +<li>Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing.</li> +</ul> +<p>In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. 
Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.</p> +<ul> +<li>Data generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary, re-generating every time is not an option.</li> +<li>Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce?</li> +<li>Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a &lsquo;fast&rsquo; and &lsquo;slow&rsquo; case of a single query template. How long does one need to run to balance these fluctuations?</li> +<li>Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput.</li> +<li>Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse.</li> +<li>Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. 
The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable.</li> +<li>Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy?</li> +<li>Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters?</li> +</ul> +<p>The following posts will look at the above in light of actual experience.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + Why Do We Need an LDBC SNB-Specific Workload Driver? + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + Tue, 21 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + <p>In a previous <a href="https://ldbcouncil.org/tags/driver">3-part blog series</a> we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. 
What we didn&rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - &ldquo;graph&rdquo; is the word that best captures the novelty and difficulty of this work.</p> +<p><strong>Workload Execution - Traditional vs Graph</strong></p> +<p>Transactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.</p> +<p>To understand what is meant by &ldquo;traditional relational workloads&rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. Note, &ldquo;dependency&rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator&rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. 
In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order <em>for a given user</em>. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.</p> +<p>A significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each others walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other user&rsquo;s posts etc) initiated by a given user. Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. 
Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.</p> +<p>Such scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend&rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. <em>Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.</em></p> +<p><strong>Because it&rsquo;s a graph</strong></p> +<p>In short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.</p> + + + + + Event Driven Post Generation in Datagen + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + Fri, 10 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + <p>As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.</p> +<p>First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during friendship edges generation process. Then, for each person of the block, three types of forums are created:</p> +<ul> +<li> +<p>The wall of the person</p> +</li> +<li> +<p>The albums of the person</p> +</li> +<li> +<p>The groups where the person is a moderator</p> +</li> +</ul> +<p>We will put our attention to group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.</p> +<p>After assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible of, given a forum, generate a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.</p> +<p>Flashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to the &ldquo;Enrique Iglesias&rdquo; tag, whose level is 11 and which occurs on 29th of May of 2012 at 09:33:47.</p> +<p>Once the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:</p> +<ul> +<li> +<p>Determine the number of posts to generate</p> +</li> +<li> +<p>Select a random member of the group that will generate the post</p> +</li> +<li> +<p>Determine the event the post will be related to given the aforementioned cumulative distribution</p> +</li> +<li> +<p>Assign the date of the post based on the event date</p> +</li> +</ul> +<p>In order to assign the date to the post, based on the date of the event the post is assigned to, we follow the following probability density, which has been extracted from <a href="#references">[1]</a>. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.</p> +<p><img src="index.png" alt=""></p> +<p>Following the example of &ldquo;Enrique Iglesias&rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.</p> +<p><img src="index2.png" alt=""></p> +<p>In this blog entry we have seen how datagen creates event driven user activity. This allows us to reproduce the heterogenous post creation density found in a real social network, where post creation is driven by real world events.</p> +<h4 id="references">References</h4> +<p>[1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. 
KDD 2009: 497-506</p> + + + + + Sixth TUC Meeting + https://ldbcouncil.org/event/sixth-tuc-meeting/ + Thu, 19 Mar 2015 13:53:33 -0400 + + https://ldbcouncil.org/event/sixth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on <strong>Thursday and Friday March 19/20, 2015.</strong></p> +<p>The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the first benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.</li> +<li>Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<h3 id="agenda">Agenda</h3> +<p><strong>Thursday 19th March</strong></p> +<p>11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)</p> +<p>11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981131.pdf">slides</a></p> +<p>12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)</p> +<p>12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981137.pdf">slides</a></p> +<p>12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain</p> +<p>12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive</p> +<p>13:10 Claudio Martella (VUA): Giraph and Lighthouse</p> +<p>13:30 - 14:30 Lunch break</p> +<p>14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)</p> +<p>14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981132.pdf">slides</a></p> +<p>14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981133.pdf">slides</a></p> +<p>15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981139.pdf">slides</a></p> +<p>15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs</p> +<p>18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.</p> +<p>20:00 Social dinner at <a href="http://www.bastaix.com">Bastaix Restaurant</a>.</p> +<p><strong>Friday 20th March</strong></p> +<p>9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)</p> +<p>9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics</p> +<p>9:50 Alexandru Iosup (TU Delft). 
Graphalytics: A big data benchmark for graph-processing platforms</p> +<p>10:10 John Snelson (MarkLogic): Introduction to MarkLogic</p> +<p>10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload</p> +<p>10:50 Moritz Kaufmann. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/moritz-kaufmann-ldbc-snb-benchmark-auditing-6th-ldbc-tuc.pdf">The auditing experience</a></p> +<p>11:15 - 11:45 Coffee break</p> +<p>11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)</p> +<p>11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox</p> +<p>12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data</p> +<p>12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments</p> +<p>12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981138.pdf">slides</a></p> +<p>13:30 - 14:30 Lunch break</p> +<p>15:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>19th and 20th March 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held at &ldquo;Aula Master&rdquo; at A3 building located inside the &ldquo;Campus Nord UPC&rdquo; in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h5 id="maps-and-situation"><strong>Maps and situation</strong></h5> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<h5 id="finding-upc"><strong>Finding UPC</strong></h5> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<h5 id="finding-the-meeting-room"><strong>Finding the meeting room</strong></h5> +<h5 id="getting-there">Getting there</h5> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to<br> +the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<h5 id="the-locations-of-the-airport-and-the-city-centre"><strong>The locations of the airport and the city centre</strong></h5> + + + + + The LDBC Datagen Community Structure + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + Sun, 15 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + <p>This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.</p> +<p>When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, have typically highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.</p> +<p>The first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 <a href="#references">[1]</a>. Here we summarize the paper and its contributions and findings.</p> +<p>Existing synthetic graph generators such as Rmat <a href="#references">[2]</a> and Mag <a href="#references">[3]</a> are graph generators designed to produce graphs with long tailed distributions and large clustering coefficient, but completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR <a href="#references">[4]</a>, a graph generator that not only produced graphs with realistic high level characteristics, but also enforced an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it outputs not only a graph but also the communities present in that graph, hence it can be used to test the quality of a community detection algorithm.</p> +<p>However, no one studied if the community structure produced by LFR was in fact realistic compared to real graphs. Even though the community structure in LFR exhibits interesting properties, such as the expected larger internal density than external, or a longtailed distribution of community sizes, it lacks the noise and inhomogeneities present in a real graph. 
And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? Is it more or less realistic? The authors of <a href="#references">[1]</a> set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to <a href="#references">[4]</a>.</p> +<table> +<thead> +<tr> +<th></th> +<th><em>Nodes</em></th> +<th><em>Edges</em></th> +</tr> +</thead> +<tbody> +<tr> +<td><em>Amazon</em></td> +<td>334863</td> +<td>925872</td> +</tr> +<tr> +<td><em>Dblp</em></td> +<td>317080</td> +<td>1049866</td> +</tr> +<tr> +<td><em>Youtube</em></td> +<td>1134890</td> +<td>2987624</td> +</tr> +<tr> +<td><em>Livejournal</em></td> +<td>3997962</td> +<td>34681189</td> +</tr> +</tbody> +</table> +<p>The authors of <a href="#references">[1]</a> selected a set of statistical indicators to<br> +characterize the communities:</p> +<ul> +<li>The clustering coefficient</li> +<li>The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community.</li> +<li>The bridge ratio, which is the ratio of edges whose removal disconnects the community.</li> +<li>The diameter</li> +<li>The conductance</li> +<li>The size</li> +</ul> +<p>The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. 
We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.</p> +<ul> +<li>Many of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and bridge ratio. This suggests that communities cannot be modeled using a single model.</li> +<li>84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact.</li> +<li>Ground truth communities are not very isolated; they have a lot of connections pointing outside of the community.</li> +<li>Most of the communities are small (10 or fewer nodes).</li> +<li>In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index.png" alt=""></td> +<td style="text-align:center"><img src="index2.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index3.png" alt=""></td> +<td style="text-align:center"><img src="index4.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index5.png" alt=""></td> +<td style="text-align:center"><img src="index6.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> 
+<p>The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index8.png" alt=""></td> +<td style="text-align:center"><img src="index9.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index10.png" alt=""></td> +<td style="text-align:center"><img src="index11.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index11.png" alt=""></td> +<td style="text-align:center"><img src="index12.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>The main conclusions that can be extracted from DATAGEN can be summarized as follows:</p> +<ul> +<li>DATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio.</li> +<li>The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the Youtube and Livejournal graphs.</li> +<li>Communities of DATAGEN graphs are not, as in real graphs, isolated, but in this case their level of isolation is significantly larger.</li> +<li>The diameter is small like in the real graphs.</li> +<li>It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and 
Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics.</li> +</ul> +<p>Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. From them, the authors extract the following conclusions:</p> +<ul> +<li>LFR graphs do not show the multimodal distribution observed in real graphs</li> +<li>Only the diameter shows a similar shape as in the ground truth.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index13.png" alt=""></td> +<td style="text-align:center"><img src="index14.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index15.png" alt=""></td> +<td style="text-align:center"><img src="index16.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index17.png" alt=""></td> +<td style="text-align:center"><img src="index18.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman&rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. 
On the other hand, LFR only succeeds significantly in the case of the Diameter.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index19.png" alt=""></td> +<td style="text-align:center"><img src="index20.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index21.png" alt=""></td> +<td style="text-align:center"><img src="index22.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index23.png" alt=""></td> +<td style="text-align:center"><img src="index24.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could be potentially exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!</p> +<h4 id="references">References</h4> +<p>[1] Arnau Prat-Pérez, <a href="http://dblp.uni-trier.de/pers/hd/d/Dom=iacute=nguez=Sal:David">David Domínguez-Sal</a>: How community-like is the structure of synthetically generated graphs? <a href="http://dblp.uni-trier.de/db/conf/sigmod/grades2014.html#PratD14">GRADES 2014</a></p> +<p>[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014</p> +<p>[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics</p> +<p>[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. 
Benchmark graphs for testing community detection algorithms. Physical Review E 2008.</p> + + + + + Industry Relevance of the Semantic Publishing Benchmark + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + Tue, 03 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + <h3 id="publishing-and-media-businesses-are-going-through-transformation">Publishing and media businesses are going through transformation</h3> +<p>I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendees were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. It was not long ago, in the year 2000, this stand was full. Back then the people in the Bay area were willing to pay for printed newspapers. But this is no longer true.</p> +<p>What’s driving this change in publishing and media?</p> +<ul> +<li> +<p>Widespread and instantaneous distribution of information over the Internet has turned news into somewhat of a &ldquo;commodity&rdquo; and few people are willing to pay for it</p> +</li> +<li> +<p>The wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;</p> +</li> +<li> +<p>Open access publishing has limited the ability of academic publishers to sell journals and books at prices that were considered fair ten years ago.</p> +</li> +</ul> +<p><em>Alongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.</em></p> +<h3 id="imagine-instant-news-in-context-imagine-personal-channels-imagine--triplestores">Imagine instant news in context, Imagine personal channels, Imagine &hellip; triplestores</h3> +<p>While plain 
news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. It is also well-known that producing such news in &ldquo;near real time&rdquo; is difficult and expensive using legacy processes and content management technology.</p> +<p>Another example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like <a href="http://new.dowjones.com/products/factiva/">Factiva</a>, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.</p> +<p>Finally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.</p> +<p>Many publishers have been pressed to advance their business. This, in turn, has led to a quest to innovate. And semantic technology can help publishers in two fundamental ways:</p> +<ol> +<li>Generation of rich and &ldquo;meaningful&rdquo; (trying not to use &ldquo;semantic&rdquo; :-) metadata descriptions;</li><li>Dynamic retrieval of content, based on this rich metadata, enabling better delivery.</li> +</ol> +<p>In this post I write about &ldquo;semantic annotation&rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. 
The final part of the post is about triplestores – semantic graph database engines, used in DSP. To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.</p> +<h3 id="semantic-annotation-produces-rich-metadata-descriptions--the-fuel-for-semantic-publishing">Semantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing</h3> +<p>The most popular meaning of &ldquo;semantic annotation&rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.</p> +<p><img src="02_semantic_repository.png" alt=""></p> +<p>The concept of using <a href="http://infosys3.elfak.ni.ac.rs/nastava/attach/SemantickiWebKurs/sdarticle.pdf">text-mining for automatic semantic annotation</a> of text with respect to very large datasets, such as <a href="http://dbpedia.org/">DBPedia</a>, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether &ldquo;Paris&rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. 
Sometimes this is massively difficult – try to instruct a computer how to guess whether &ldquo;Hilton&rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or whether it means that I had the chance to meet Paris Hilton in person on the street in San Francisco.</p> +<p>Today there are plenty of tools (such as the <a href="https://www.ontotext.com/semantic-solutions/media-publishing/">Ontotext Media and Publishing</a> platform and <a href="https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki">DBPedia Spotlight</a>) and services (such as Thomson Reuters’ <a href="http://www.opencalais.com/">OpenCalais</a> and Ontotext’s <a href="http://s4.ontotext.com">S4</a>) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios where technology like this would revolutionize a business. This is the case with the Dynamic Semantic Publishing scenario described below.</p> +<h3 id="the-bbcs-dynamic-semantic-publishing-dsp">The BBC’s Dynamic Semantic Publishing (DSP)</h3> +<p>Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.</p> +<p>BBC Future Media &amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. 
At the end of the day DSP improves the user experience on BBC’s web site.</p> +<p><em>&ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories&rdquo;.</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">Jem Rayfield, Senior Technical Architect</a>, BBC News and Knowledge</p> +<p>The Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (&ldquo;edited by exception&rdquo;) and support semantic advertisement placement for audiences outside of the UK. The following quote explains the workflow when a new article gets into BBC’s content management system.</p> +<p><em>&ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#language">natural language and ontological determiner process</a> automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.</em></p> +<p><img src="03_bbc_sport.png" alt=""></p> +<p><em>Journalist-published metadata is captured and made persistent for querying using the resource description framework (<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#RDF"><em>RDF</em></a>) metadata representation and triple store technology. 
<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#BigOWLIM">A RDF triplestore</a> and <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#SPARQL">SPARQL</a> approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept &ldquo;Frank Lampard&rdquo;, then the framework infers and applies concepts such as &ldquo;England Squad&rdquo;, &ldquo;Group C&rdquo; and &ldquo;FIFA World Cup 2010&rdquo; &hellip;&rdquo;</em> &ndash; Jem Rayfield</p> +<p>One can consider each of the &ldquo;aggregation pages&rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.</p> +<p><em>&ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content</em></p> +<p><strong>…</strong><strong><em>we are not publishing pages, but publishing content</em></strong> <em>as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.</em></p> +<p><img src="04_content_tagging.png" alt=""></p> +<p><em>… The index pages are published automatically. 
This process is what assures us of the highest quality output, but still <strong>save large amounts of time</strong> in managing the site and <strong>makes it possible for us to efficiently run so many pages</strong> for the World Cup.&rdquo;</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/the_world_cup_and_a_call_to_ac.html">John O&rsquo;Donovan, Chief Technical Architect, BBC Future Media &amp; Technology</a></p> +<p>To get a real feeling about the load of the triplestore behind BBC&rsquo;s World Cup web site, here are some statistics:</p> +<ul> +<li> +<p>800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;</p> +</li> +<li> +<p>Average unique page requests/day: 2 million;</p> +</li> +<li> +<p>Average <strong>SPARQL queries/day: 1 million;</strong></p> +</li> +<li> +<p><strong>100s repository updates/inserts per minute</strong> with OWL 2 RL reasoning;</p> +</li> +<li> +<p>Multi data center that is fully resilient, clustered 6 node triplestore.</p> +</li> +</ul> +<h3 id="the-semantic-publishing-benchmark">The Semantic Publishing Benchmark</h3> +<p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).</p> +<p>SPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. This content is being enriched with metadata that describes it through links to reference knowledge:</p> +<ul> +<li> +<p><em>Reference knowledge:</em> taxonomies and databases that include relevant concepts, entities and factual information (e.g. 
sport statistics);</p> +</li> +<li> +<p><em>Metadata</em> for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.</p> +</li> +</ul> +<p>In this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:</p> +<ul> +<li> +<p><em>Aggregation queries</em> retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;</p> +</li> +<li> +<p><em>Updates</em>, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.</p> +</li> +</ul> +<p>SPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from <a href="http://www.geonames.org/">Geonames</a> for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.</p> +<p>A more technical introduction to SPB can be found in this <a href="https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark">post</a>. 
Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">post</a>. An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.</p> +<p>Despite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of &ldquo;fast flowing&rdquo; content need to be &ldquo;dispatched&rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:</p> +<ul> +<li> +<p>The Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;</p> +</li> +<li> +<p>Reasoning is needed to map content descriptions to queries in a flexible manner;</p> +</li> +<li> +<p>There are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.</p> +</li> +</ul> +<h3 id="spb-v20--steeper-for-the-engines-closer-to-the-publishers">SPB v.2.0 – steeper for the engines, closer to the publishers</h3> +<p>We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.</p> +<p>The major changes in SPB v.2.0 can be summarized as follows:</p> +<ul> +<li> +<p>Much bigger reference dataset: from 170 thousand to 22 million statements. 
Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;</p> +</li> +<li> +<p>Better interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;</p> +</li> +<li> +<p>Retrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.</p> +</li> +</ul> + + + + + OWL-Empowered SPARQL Query Optimization + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + Wed, 18 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + <p>The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. 
Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.</p> +<p>In this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompany Linked Data.</p> +<p>OWL adopts the Open World Assumption and hence OWL axioms are perceived primarily as a means to infer new knowledge. Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.</p> +<p>This richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.</p> +<p>The aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. 
To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found <a href="LDBC_D4.4.2_final.pdf">here</a>.</p> +<h3 id="schema-based-optimization-techniques">Schema-Based Optimization Techniques</h3> +<p>Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.</p> +<p>A simple first case is the case of constraint violation. Consider the query below, which returns all instances of class <code>&lt;A&gt;</code> which are fillers of a specific property <code>&lt;P&gt;</code>. If the underlying schema contains the information that the range of <code>&lt;P&gt;</code> is class <code>&lt;B&gt;</code>, and that class <code>&lt;B&gt;</code> is disjoint from class <code>&lt;A&gt;</code>, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.</p> +<pre tabindex="0"><code>SELECT ?v +WHERE { ?v rdf : type &lt;A&gt; . + ?u &lt;P&gt; ?v . ?u &lt;P&gt; ?v1 . + ?u &lt;P1 &gt; ?v2 . ?u &lt;P2 &gt; ?v3 . + ?u &lt;P3 &gt; ?v4 . ?u &lt;P4 &gt; ?v5} +</code></pre><p>Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class <code>&lt;A1&gt;</code> can immediately be identified as not being in the answer set.</p> +<pre tabindex="0"><code>SELECT ?c +WHERE { ?x rdf: type ?c . ?x &lt;P&gt; ?y . 
+ FILTER NOT EXISTS \{ ?x rdf: type &lt;A1 &gt; }} +</code></pre><p>Another category of schema-empowered optimizations has to do with improved selectivity estimation. In this respect, knowledge about the cardinality (minimum cardinality, maximum cardinality, exact cardinality, functionality) of a property can be exploited to formulate better query plans, even if data statistics are incomplete, missing or erroneous.</p> +<p>Similarly, taking into account class hierarchies, or the definition of classes/properties via set theoretic constructs (union, intersection) at the schema level, can provide valuable information on the selectivity of certain triple patterns, thus facilitating the process of query optimization. Similar effects can be achieved using information about properties (functionality, transitivity, symmetry etc).</p> +<p>As an example of these patterns, consider the query below, where class <code>&lt;C&gt;</code> is defined as the intersection of classes <code>&lt;C1&gt;</code>,<code> &lt;C2&gt;</code>. Thus, the triple pattern <code>(?x rdf:type &lt;C&gt;)</code> is more selective than <code>(?y rdf:type &lt;C1&gt;)</code> and <code>(?z rdf:type &lt;C2&gt;)</code> and this should be immediately recognizable by the optimizer, without having to resort to cost estimations. This example shows also how unnecessary triple patterns can be pruned from a query to reduce the number of necessary joins. Figure 1 illustrates the query plan obtained when the OWL intersectionOf construct is used.</p> +<pre tabindex="0"><code>SELECT ?x +WHERE { ?x rdf: type &lt;C&gt; . ?x &lt;P1 &gt; ?y . + ?y rdf : type &lt;C1 &gt; . ?y &lt;P2 &gt; ?z . ?z rdf : type &lt;C2 &gt; } +</code></pre><p><img src="owl_constraints.png" alt="image"></p> +<p>Schema information can also be used by the query optimizer to rewrite SPARQL queries to equivalent ones that are found in a form for which already known optimization techniques are easily applicable. 
For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property <code>P4</code> is a symmetric property.</p> +<pre tabindex="0"><code>SELECT ?y ?y1 ?y2 ?y3 +WHERE { ?x &lt;P1 &gt; ?y . ?x &lt;P2 &gt; ?y1 . + ?x &lt;P3 &gt; ?y2 . ?y3 &lt;P4 &gt; ?x } +</code></pre><h3 id="conclusion">Conclusion</h3> +<p>In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:</p> +<ul> +<li>Cases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results.</li> +<li>Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing.</li> +<li>Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query.</li> +<li>Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques.</li> +</ul> +<p>This list is by no means complete, as further cases can be identified by optimizers. 
Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.</p> + + + + + Person Activity Subgraph Features in LDBC DATAGEN + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + Wed, 04 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + <p>When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other things worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text, images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed of posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.</p> +<p>When looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries need to traverse parts of the person activity subgraph, and about 80% of all the data generated by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to making sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. 
In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.</p> +<h3 id="reaslistic-message-content">Realistic Message Content</h3> +<p>Messages&rsquo; content in DATAGEN is not random, but contains snippets of text extracted from DBpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of comments (replies to posts), there is a probability of 0.66 that the comment is very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is typical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying to posts change during the flow of the conversation, moving from the post&rsquo;s tags to other related or randomly selected tags.</p> +<h3 id="non-uniform-activity-levels">Non uniform activity levels</h3> +<p>In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomenon by correlating the activity level with the number of friends the person has. That is, the larger the number of friends a person has, the larger the number of posts they create, and also, the larger the number of groups they belong to.</p> +<h3 id="time-correlated-post-and-comment-generation">Time correlated post and comment generation</h3> +<p>In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. 
For this reason, we observe spikes of activity around these events, where the number of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also, not all events are equally relevant, so some spikes are larger than others. The shape of the activity is modeled following the model described in <a href="#references">[1]</a>. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.</p> +<p><img src="1.png" alt="image"></p> +<p>As we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing activity volume is due to the fact that more people are added to the social network as time advances.</p> +<p>In this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.</p> +<h4 id="references">References</h4> +<p>[1] Leskovec, J., Backstrom, L., &amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In <em>Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 497-506). 
ACM.</p> + + + + + SNB Driver - Part 2: Tracking Dependencies Between Queries + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + Fri, 23 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + <p>The <a href="https://ldbcouncil.org/post/snb-driver-part-1">SNB Driver part 1</a> post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we&rsquo;ll drill down deeper into the details of what it means to execute &ldquo;dependent queries&rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.</p> +<h3 id="definitions">Definitions</h3> +<ul> +<li> +<p><em>Simulation Time (ST)</em>: notion of time created by data generator. All time stamps in the generated data set are in simulation time</p> +</li> +<li> +<p><em>Real Time (RT)</em>: wall clock time</p> +</li> +<li> +<p><em>Time Compression Ratio</em>: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark</p> +</li> +<li> +<p><em>Operation</em>: read and/or write</p> +</li> +<li> +<p><em>Dependencies</em>: operations in this set introduce dependencies in the workload. 
That is, for every operation in this set there exists at least one other operation (in Dependents) that can not be executed until this operation has been processed</p> +</li> +<li> +<p><em>Dependents</em>: operations in this set are dependent on at least one other operation (in Dependencies) in the workload</p> +</li> +<li> +<p><em>Due Time (DueT)</em>: point in simulation time at which the execution of an operation should be initiated.</p> +</li> +<li> +<p><em>Dependent Time (DepT)</em>: in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.</p> +</li> +<li> +<p><em>Safe Time (SafeT)</em>: time duration.</p> +<ul> +<li> +<p>when two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them</p> +</li> +<li> +<p>SafeT is the minimum duration between the Dependent Time and Due Time of any operation in Dependents</p> +</li> +</ul> +</li> +<li> +<p><em>Operation Stream</em>: sequence of operations ordered by Due Time (dependent operations must be separated by at least SafeT)</p> +</li> +<li> +<p><em>Initiated Operations</em>: operations that have started executing but not yet finished</p> +</li> +<li> +<p><em>Local Completion Time (per driver)</em>: point in simulation time behind which there are no uncompleted operations. Local Completion Time = min(min(Initiated Operations), max(Completed Operations))</p> +</li> +<li> +<p><em>Global Completion Time (GCT)</em>: minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation, that operation is safe to execute, i.e., the operations it depends on have all completed executing. 
Global Completion Time = min(Local Completion Time)​</p> +</li> +<li> +<p><em>Execution Window (Window)</em>: a timespan within which all operations can be safely executed</p> +<ul> +<li> +<p>All operations satisfying window.startTime &lt;= operation.DueT &lt; window.endTime may be executed</p> +</li> +<li> +<p>Within a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window</p> +</li> +<li> +<p>To ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 &lt; window.duration &lt;= SafeT</p> +</li> +<li> +<p>Window duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable</p> +</li> +<li> +<p>Before any operations within a window can start executing it is required that: GCT &gt;= window.startTime - (SafeT - window.duration)</p> +</li> +<li> +<p>All operations within a window must initiate and complete between window start and end times: window.startTime &lt;= operation.initiate &lt; window.endTime and window.startTime &lt;= operation.complete &lt; window.endTime</p> +</li> +</ul> +</li> +<li> +<p><em>Dependency Mode</em>: defines dependencies, constraints on operation execution order</p> +</li> +<li> +<p><em>Execution Mode</em>: defines how the runtime should execute operations of a given type</p> +</li> +</ul> +<h3 id="tracking-dependencies">Tracking Dependencies</h3> +<p>Now, the fun part, making sure dependent operations are executed in the correct order.</p> +<p>Consider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. 
That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).</p> +<p>Logically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:</p> +<ul> +<li>the set of operations that have started executing but not yet finished.</li> +</ul> +<p>Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:</p> +<ul> +<li>the set of operations that have started and finished executing.</li> +</ul> +<p>Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):</p> +<ul> +<li>the point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time.</li> +</ul> +<p>LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:</p> +<ul> +<li> +<p><em>Due Time</em>: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled</p> +</li> +<li> +<p><em>GCT</em>: every operation (from Dependencies) with a Due Time before this point in time has completed execution</p> +</li> +</ul> +<p>However, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. 
What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:</p> +<ul> +<li>in addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute.</li> +</ul> +<p>Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.</p> +<h3 id="scalable-execution-in-the-presence-of-dependencies">Scalable execution in the Presence of Dependencies</h3> +<p>The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:</p> +<p>a) make the generated load less &lsquo;bursty&rsquo;</p> +<p>b) allow the driver to &lsquo;scale&rsquo;, so when the driver is given more resources (CPUs, servers, etc.) 
it is able to generate more load.</p> +<p>In the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. Logically, all operations within a Window are executed at the same time, some time within the Window. No guarantee is made regarding exactly when, or in what order, an operation will execute within its Window.</p> +<p>The reasons this approach is correct are as follows:</p> +<ul> +<li> +<p>Operations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked</p> +</li> +<li> +<p>The minimum duration between the Dependent Time and Due Time of any operation in Dependents is known (can be calculated by scanning through workload once), this duration is referred to as Safe Time (SafeT)</p> +</li> +<li> +<p>A window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until</p> +<p>GCT &gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT</p> +</li> +</ul> +<p>The advantages of such an execution mode are as follows:</p> +<ul> +<li> +<p>As no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window</p> +</li> +<li> +<p>Then, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. 
There is no need or benefit to communicating GCT protocol message more frequently than approximately Window.duration, the side effect of which is reduced network traffic</p> +</li> +<li> +<p>Further, by making no guarantees regarding the order of execution the driver is free to reschedule operations (within Window bounds). The advantage being that operations can be rearranged in such a way as to reduce unwanted bursts of load during execution, which could otherwise occur while synchronizing GCT during demanding workloads. For example, a uniform scheduler may modify operation Due Times to be uniformly distributed across the Window timespan, to &lsquo;smoothen&rsquo; the load within a Window.</p> +</li> +</ul> +<p>As with any system, there are trade-offs to this design, particularly regarding Window.duration. The main trade-off is that between &lsquo;workload resolution&rsquo; and scalability. Increasing Window.duration reduces synchronization but also reduces the resolution at which the workload definition is followed. That is, the generated workload becomes less like the workload definition. However, as this is both bounded and configurable, it is not a major concern. This issue is illustrated in Figure 1, where the same stream of events is split into two different workloads based on different size of the Window. The workload with Window size 5 (on the right) has better resolution, especially for the &lsquo;bursty&rsquo; part of the event stream.</p> +<p><img src="window-scheduling.png" alt="image"><br> +Figure 1. Window scheduling</p> +<p>This design also trades a small amount of repeatability for scalability: as there are no timing or ordering guarantees within a window, two executions of the same window are not guaranteed to be equivalent - &lsquo;what happens in the window stays in the window&rsquo;. Despite sacrificing this repeatability, the results of operations do not change. 
No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.</p> + + + + + SNB Driver - Part 3: Workload Execution Putting It All Together + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + Tue, 20 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + <p>Up until now we have introduced the <a href="https://ldbcouncil.org/post/snb-driver-part-1">challenges faced when executing the LDBC SNB benchmark</a>, as well as explained <a href="https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries">how some of these are overcome</a>. With the foundations laid, we can now explain precisely how operations are executed.</p> +<p>Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.</p> +<h3 id="dependency-modes">Dependency Modes</h3> +<p>While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.</p> +<p>Another way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. 
The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. During workload execution, operations of each type are treated as follows:</p> +<p><strong>• None</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>– Prior Execution: do nothing – After Execution: do nothing</p> +<p><strong>• Read Only</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: wait for GCT &gt;= operation.DepTime – After Execution: do nothing</p> +<p><strong>• Write Only</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<p><strong>• Read Write</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced 
sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations, wait for GCT &gt;= operation.DepT</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<h3 id="execution-modes">Execution Modes</h3> +<p>Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more streams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.</p> +<p><strong>• Asynchronous</strong>: operations are executed individually, when their Due Time arrives.</p> +<p>Motivation: This is the default execution mode, it executes operations as true to the workload definition as possible.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: unbounded</p> +<p>– Max Execution Time: unbounded</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay</p> +<p><strong>• Synchronous</strong>: operations are executed individually, sequentially, in blocking manner.</p> +<p>Motivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. 
However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency. Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.</p> +<p>The alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. 
Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler</p> +<p>– Execute When time &gt;= operation.DueT and previousOperation.completed == true (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: 1</p> +<p>– Max Execution Time: nextOperation.DueT - operation.DueT</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay. E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay</p> +<p><strong>• Partially Synchronous</strong> (Windowed Execution, described in Section 3.4 in more detail), groups of operations from the same time window are executed together</p> +<p>– Re-scheduling Before Execution: Yes, as long as the following still holds:</p> +<p>window.startTime &lt;= operation.DueT &lt; window.startTime + window.duration</p> +<p>Operations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due Times, and therefore ordering, may be modified</p> +<p>– Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: number of operations within window</p> +<p>– Max Execution Time: (window.startTime + window.duration) - operation.DueT</p> +<p>– Failure: operation execution starts later than: window.startTime + window.duration; operation execution does not finish by: window.startTime + window.duration</p> +<h3 id="tying-it-back-to-ldbc-snb">Tying it back to LDBC SNB</h3> +<p>The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stems from dependencies introduced by the graph structure. 
In other words, workload partitioning becomes as hard as graph partitioning.</p> +<p>The LDBC SNB data can in fact be seen as a union of two parts:</p> +<ol> +<li> +<p>Core Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.</p> +</li> +<li> +<p>User Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the &ldquo;core&rdquo; part are satisfied (i.e., users don&rsquo;t post things before the profiles are created, etc.).</p> +</li> +</ol> +<p>In order to avoid friendship graph partitioning, the driver introduces the concept of SafeT, the minimal simulation time that should pass between two dependent events.</p> +<p>This property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety. Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.</p> +<p>On the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there are typically many forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. 
we efficiently create the so-called flash-mob effects in the posting/replying workload).</p> + + + + + Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + Tue, 13 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + <p>Until now we have discussed several aspects of the <a href="https://ldbcouncil.org/benchmarks/spb">Semantic Publishing Benchmark (SPB)</a> such as the <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">difference in performance between virtual and real servers configuration</a>, how to choose an <a href="https://ldbcouncil.org/post/making-semantic-publishing-execution-rules">appropriate query mix</a> for a benchmark run and our experience with using SPB in the development process of GraphDB for <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">finding performance issues</a>.</p> +<p>In this post we provide a step-by-step guide on how to run SPB using the <a href="http://rdf4j.org/">Sesame</a> RDF data store on a fresh install of <a href="http://releases.ubuntu.com/14.04.1/">Ubuntu Server 14.04.1</a>. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.</p> +<h3 id="prerequisites">Prerequisites</h3> +<p>We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:</p> +<ul> +<li>Git</li> +<li>Apache Ant 1.8 or higher</li> +<li>OpenJDK 6 or Oracle JDK 6 or higher</li> +<li>Apache Tomcat 7 or higher</li> +</ul> +<p>If you already have these components installed on your machine you can directly proceed to the next section: <em>Installing Sesame</em></p> +<p>Following are sample commands which can be used to install the required software components:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo apt-get install git +</span></span><span style="display:flex;"><span>sudo apt-get install ant +</span></span><span style="display:flex;"><span>sudo apt-get install default-jdk +</span></span><span style="display:flex;"><span>sudo apt-get install tomcat7 +</span></span></code></pre></div><p>Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.</p> +<p>After a successful installation of Apache Tomcat you should be able to get the default splash page <em>“It works”</em> when you open your web browser and enter the following address: http://&lt;your_ip_address&gt;:8080</p> +<h3 id="installing-sesame">Installing Sesame</h3> +<p>We will use current Sesame version 2.7.14. 
You can download it <a href="http://sourceforge.net/projects/sesame/files/Sesame%202/">here</a> or run following command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>wget <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download&#34;</span> <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> -O openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>Then extract the Sesame tarball:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>tar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>To deploy sesame you have to copy the two war files that are in <em>openrdf-sesame-2.7.14/war</em> to <em>/var/lib/tomcat7/webapps</em></p> +<p>From <em>openrdf-sesame-2.7.14/war</em> you can do it with command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>cp openrdf-*.war &lt;tomcat_install&gt;/webapps +</span></span></code></pre></div><p>Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.</p> +<p>By default the configuration directory is: <em>/usr/share/tomcat7/.aduna</em></p> +<p>Create the directory:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" 
data-lang="bash"><span style="display:flex;"><span>sudo mkdir /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Then change the ownership:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chown tomcat7 /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>And finally you should give the necessary permissions:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chmod o+rwx /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Now when you go to: http://&lt;your_ip_address&gt;:8080/openrdf-workbench/repositories</p> +<p>You should get a screen like this:</p> +<p><img src="01-Sesame-repo-list.png" alt="image"></p> +<h3 id="setup-spb">Setup SPB</h3> +<p>You can download the SPB code and find brief documentation on GitHub:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm">https://github.com/ldbc/ldbc_spb_bm</a></p> +<p>A detailed documentation is located here:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf">https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf</a></p> +<p>SPB offers many configuration options which control various features of the benchmark e.g.:</p> +<ul> +<li>query mixes</li> +<li>dataset size</li> +<li>loading datasets</li> +<li>number of agents</li> +<li>validating results</li> +<li>test conformance to OWL2-RL ruleset</li> +<li>update rate of agents</li> +</ul> +<p>Here we demonstrate how to generate a dataset and execute a simple test<br> +run with it.</p> +<p>First download the SPB source code from the repository:</p> +<div class="highlight"><pre tabindex="0" 
style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>git clone https://github.com/ldbc/ldbc_spb_bm.git +</span></span></code></pre></div><p>Then in the ldbc_spb_bm directory build the project:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>ant build-basic-querymix +</span></span></code></pre></div><p>If you simply execute the command:</p> +<pre tabindex="0"><code>ant +</code></pre><p>you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this step-by-step guide, configuration shown above is sufficient.</p> +<p>Depending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat&rsquo;s startup files e.g. in <em>catalina.sh</em>:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>export JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;-d64 -Xmx4G&#34;</span> +</span></span></code></pre></div><p>To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:</p> +<p><img src="02-Sesame-create-repo.png" alt="image"></p> +<p>Then we need to point the benchmark test driver to the SPARQL endpoint of that repository. 
This is done in <em>ldbc_spb_bm/dist/test.properties</em> file.</p> +<p>The default value of <em>datasetSize</em> in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.</p> +<p>You need to change</p> +<pre tabindex="0"><code>datasetSize=1000000 +</code></pre><p>Also the URLs of the SPARQL endpoint for the repository</p> +<pre tabindex="0"><code>endpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 +endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements +</code></pre><p>First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.</p> +<p>These are the settings to do that, following parameters will &lsquo;instruct&rsquo; the SPB test driver to perform all the actions described above:</p> +<pre tabindex="0"><code>#Benchmark Operational Phases +loadOntologies=true +loadReferenceDatasets=true +generateCreativeWorks=true +loadCreativeWorks=true +generateQuerySubstitutionParameters=true +validateQueryResults=false +warmUp=false +runBenchmark=false +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>To run the benchmark execute the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>java -jar semantic_publishing_benchmark-basic-standard.jar +</span></span><span style="display:flex;"><span>test.properties +</span></span></code></pre></div><p>When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.</p> +<p>Next we will measure the performance of Sesame Data Store by changing some configuration properties:</p> +<pre tabindex="0"><code>#Benchmark 
Configuration Parameters +warmupPeriodSeconds=60 +benchmarkRunPeriodSeconds=300 +... +#Benchmark Operational Phases +loadOntologies=false +loadReferenceDatasets=false +generateCreativeWorks=false +loadCreativeWorks=false +generateQuerySubstitutionParameters=false +validateQueryResults=false +warmUp=true +runBenchmark=true +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>After the benchmark test run has finished result files are saved in folder: <em>dist/logs</em></p> +<p>There you will find three types of results: the result summary of the benchmark run (<em>semantic_publishing_benchmark_results.log),</em> brief results and detailed results.</p> +<p>In <em>semantic_publishing_benchmark_results.log</em> you will find the results distributed per seconds. They should be similar to the listing bellow:</p> +<p>Benchmark Results for the 300-th second</p> +<pre tabindex="0"><code>Seconds : 300 (completed query mixes : 0) + Editorial: + 2 agents + + 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) + 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) + 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) + + 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) + 0.0300 average operations per second + + Aggregation: + 8 agents + + 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) + 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) + 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) + 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) + 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) + 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) + 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) + 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) + 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) +</code></pre><p>This step-by-step 
guide gave an introduction on how to set up and run the SPB on a Sesame Data Store. Further details can be found in the reference documentation listed above.</p> +<p>If you have any trouble running the benchmark, don&rsquo;t hesitate to comment or use our social media channels.</p> +<p>In a future post we will go through some of the parameters of SPB and check their performance implications.</p> + + + + + Semantic Publishing Instance Matching Benchmark + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + Tue, 30 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + <p>The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the-art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.</p> +<p>The SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:</p> +<ul> +<li>value-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity</li> +<li>structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation)</li> +</ul> +<p>The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:</p> +<ul> +<li>instance (in)equality (owl:sameAs, owl:differentFrom)</li> +<li>class and property equivalence (owl:equivalentClass, owl:equivalentProperty)</li> +<li>class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties)</li> +<li>class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf)</li> +<li>property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty)</li> +<li>complex class definitions (owl:unionOf, owl:intersectionOf)</li> +</ul> +<p>SPIMBench uses and extends the ontologies of LDBC&rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB&rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.</p> +<p>Value and structure-based test cases are implemented using the SWING framework <a href="#references">[1]</a> on data and object type properties respectively. 
These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.</p> +<p>SPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.</p> +<p>SPIMBench can be downloaded from <a href="https://github.com/jsaveta/SPIMBench">our repository</a> and a more thorough description thereof can be found on <a href="http://www.ics.forth.gr/isl/spimbench/">http://www.ics.forth.gr/isl/spimbench/</a>.</p> +<h4 id="references">References</h4> +<p>[1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + Further Developments in SNB BI Workload + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + Thu, 18 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + <p>We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and I are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.</p> +<p>As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.</p> +<p>There are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.</p> +<p>Let’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.</p> +<p>One of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequences of events and the quantity and quality of interactions between players lead to intractably long queries which are hard to understand and debug. 
Therefore, views and intermediate materializations become increasingly necessary.</p> +<p>Another feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.</p> +<p>Since a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.</p> +<p>An orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.</p> +<p>So, let’s see how some of these aspects could be captured in the SNB context.</p> +<p>Geography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.</p> +<p>The communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or fewer participants. Membership in a high traffic forum with few participants would indicate a strong connection. 
Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -&gt; connection strength. In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.</p> +<p>The behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “Which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster at a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.</p> +<p>In John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes nearly always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”</p> +<p>Analytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.</p> +<p>From a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. 
Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.</p> +<p>This leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.</p> +<p>There is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.</p> + + + + + Sizing AWS Instances for the Semantic Publishing Benchmark + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + Wed, 17 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + <p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">BBC Dynamic Semantic Publishing</a> scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). 
As we <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">wrote earlier</a>, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.</p> +<p>Lately we tested different Amazon Web Services (<a href="https://aws.amazon.com/">AWS</a>) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that <a href="https://www.ontotext.com/products/ontotext-graphdb/">GraphDB</a> experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.</p> +<h3 id="the-experiment">The Experiment</h3> +<p>For our tests we use:</p> +<ul> +<li>GraphDB Standard v6.1</li> +<li>LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: +<ul> +<li>8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also</li> +<li>50M dataset (SF1)</li> +<li>40 minutes of benchmark run time (60 seconds of warm up)</li> +</ul> +</li> +<li>5 different Amazon EC2 instances and one local server</li> +</ul> +<p>Each test run is cold, i.e. 
data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.</p> +<p>We use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.</p> +<p>We also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.</p> +<h3 id="the-results">The Results</h3> +<p>For the tests we measured:</p> +<ul> +<li><em>queries/s</em> for the read threads, where queries include SELECT and CONSTRUCT</li> +<li><em>updates/s</em> for the write threads, where an update operation is INSERT or DELETE</li> +<li><em>queries/$</em> and <em>updates/$</em> – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput</li> +<li><em>update/vCPU</em> – modification operations per vCPU per second</li> +</ul> +<p>Results (Table 1.) 
provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is c3.2xlarge machine.</p> +<p>The improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold where the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.</p> +<p>Table 1. 
SPB Measurement Results on AWS and Local Servers</p> +<table> +<thead> +<tr> +<th>Server Type</th> +<th>vCPUs</th> +<th>R/W Agents</th> +<th>RAM (GB)</th> +<th>&ldquo;Storage (GB, SSD)&rdquo;</th> +<th>Price USD/h</th> +<th>Queries/ sec.</th> +<th>Updates/ sec.</th> +<th>Queries/ USD</th> +<th>Updates/ USD</th> +<th>Updates/ vCPU</th> +</tr> +</thead> +<tbody> +<tr> +<td>m3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>15</td> +<td>2x 40</td> +<td>0.28</td> +<td>8.39</td> +<td>8.23</td> +<td>107 882</td> +<td>105 873</td> +<td>2.06</td> +</tr> +<tr> +<td>m3.2xlarge</td> +<td>8</td> +<td>8/2</td> +<td>30</td> +<td>2x 80</td> +<td>0.56</td> +<td>15.44</td> +<td>15.67</td> +<td>99 282</td> +<td>100 752</td> +<td>1.96</td> +</tr> +<tr> +<td>c3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>7.5</td> +<td>2x 40</td> +<td>0.21</td> +<td>7.17</td> +<td>6.78</td> +<td>122 890</td> +<td>116 292</td> +<td>1.7</td> +</tr> +<tr> +<td><strong>c3.2xlarge</strong></td> +<td><strong>8</strong></td> +<td><strong>8/2</strong></td> +<td><strong>15</strong></td> +<td><strong>2x 80</strong></td> +<td><strong>0.42</strong></td> +<td><strong>16.46</strong></td> +<td><strong>14.56</strong></td> +<td><strong>141 107</strong></td> +<td><strong>124 839</strong></td> +<td><strong>1.82</strong></td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>8/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>23.23</strong></td> +<td><strong>21.17</strong></td> +<td><strong>99 578</strong></td> +<td><strong>90 736</strong></td> +<td><strong>1.32</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>8/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>22.89</td> +<td>20.39</td> +<td>98 100</td> +<td>87 386</td> +<td>1.27</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/2</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.6</td> +<td>19.11</td> +<td>114 
000</td> +<td>81 900</td> +<td>1.19</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.19</td> +<td>19.18</td> +<td>112 243</td> +<td>82 200</td> +<td>1.2</td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>14/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>30.84</strong></td> +<td><strong>16.88</strong></td> +<td><strong>132 171</strong></td> +<td><strong>72 343</strong></td> +<td><strong>1.06</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>14/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>29.67</td> +<td>17.8</td> +<td>127 157</td> +<td>76 286</td> +<td>1.11</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.11</td> +<td>32.04</td> +<td>156 712</td> +<td>135 302</td> +<td>1</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.31</td> +<td>32.07</td> +<td>157 557</td> +<td>135 429</td> +<td>1</td> +</tr> +<tr> +<td><strong>Local</strong></td> +<td><strong>32</strong></td> +<td><strong>10/2</strong></td> +<td><strong>256</strong></td> +<td><strong>8x 256</strong></td> +<td><strong>0.85</strong></td> +<td><strong>40</strong></td> +<td><strong>31.01</strong></td> +<td><strong>168 916</strong></td> +<td><strong>130 952</strong></td> +<td><strong>0.97</strong></td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.39</td> +<td>26.42</td> +<td>153 672</td> +<td>111 569</td> +<td>0.83</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.22</td> +<td>26.39</td> +<td>152 954</td> +<td>111 443</td> +<td>0.82</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>20/2</td> +<td>256</td> +<td>8x 256</td> 
+<td>0.85</td> +<td>34.59</td> +<td>23.86</td> +<td>146 070</td> +<td>100 759</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<h3 id="the-optimal-number-of-test-agents">The Optimal Number of Test Agents</h3> +<p>Experimenting with different number of aggregation (read) and editorial (write) agents at c3.4xlarge and the local server, we made some interesting observations:</p> +<ul> +<li>There is almost no benefit in using more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads;</li> +<li>Using more read agents can have negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the latter case GraphDB handles a somewhat higher number of queries (31 vs. 23) we see a drop in the update rate (from 21 to 17);</li> +<li>Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations;</li> +<li>For machines with more than 16 cores, a configuration like 10/2 or 14/2, would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server;</li> +<li>Launching more than 14 read agents does not help even on local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization.</li> +<li>There is some overhead when handling bigger number of agents as the results for the local server tests with 14/3 and 20/2 show the worst results for both queries and updates.</li> +</ul> +<h3 id="efficiency-and-cost">Efficiency and Cost</h3> +<p>AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.</p> +<p>Cloud infrastructure providers like Amazon allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.</p> +<p>$1 spent on c3.2xlarge ($0.42/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!</p> +<p>The full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for a machine like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine is with 256GB of RAM, which is an overkill for Semantic Publishing Benchmark run at 50M scale. Under all these assumptions we see that using local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference in price per query in this case is low indicates that using AWS services comes at very low extra cost.</p> +<p>To put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:</p> +<ul> +<li><strong>100 queries/sec.</strong> handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events.</li> +<li><strong>10 updates/sec.</strong> - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in separation. 
There are relatively few content management applications that need more than 36 000 updates per hour.</li> +<li><strong>$81/day</strong> is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times.</li> +</ul> + + + + + DATAGEN: a Realistic Social Network Data Generator + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + Sat, 06 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/getting-started-with-snb">Getting started with snb</a>, <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">DATAGEN: data generation for the Social Network Benchmark</a>), Arnau Prat discussed the main features and characteristics of DATAGEN: <em>realism</em>, <em>scalability</em>, <em>determinism</em>, <em>usability</em>. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the <a href="https://github.com/ldbc/ldbc_snb_datagen">instructions for generating a SNB dataset</a> and <a href="https://github.com/ldbc/ldbc_snb_implementations/tree/master/interactive/virtuoso">for loading the dataset into Virtuoso</a>. 
In the following sections, we analyze several aspects of the generated dataset.</p> +<h3 id="a-realistic-social-graph">A Realistic social graph</h3> +<p>One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their <em>&lt;knows&gt;</em> relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (see the note about <a href="https://www.facebook.com/notes/facebook-data-team/anatomy-of-facebook/10150388519243859">Facebook Anatomy</a>). This is not by chance, as DATAGEN has been designed to deliberately reproduce Facebook&rsquo;s graph distribution.</p> +<p><img src="Cumulative-distribution.png" alt="image"> <br> +Figure 1: Cumulative distribution #friends per user</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-r" data-lang="r"><span style="display:flex;"><span><span style="color:#75715e">#R script for generating the social degree distribution </span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">#Input files: person_knows_person_*.csv</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(data.table) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(igraph) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(plotrix) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">require</span>(bit64) +</span></span><span style="display:flex;"><span>dflist <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">lapply</span>(<span 
style="color:#a6e22e">commandArgs</span>(trailingOnly <span style="color:#f92672">=</span> <span style="color:#66d9ef">TRUE</span>), fread, sep<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;|&#34;</span>, +</span></span><span style="display:flex;"><span> header<span style="color:#f92672">=</span>T, select<span style="color:#f92672">=</span><span style="color:#ae81ff">1</span><span style="color:#f92672">:</span><span style="color:#ae81ff">2</span>, colClasses<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;integer64&#34;</span>) +</span></span><span style="display:flex;"><span> df <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">rbindlist</span>(dflist) <span style="color:#a6e22e">setNames</span>(df, <span style="color:#a6e22e">c</span>(<span style="color:#e6db74">&#34;P1&#34;</span>, <span style="color:#e6db74">&#34;P2&#34;</span>)) +</span></span><span style="display:flex;"><span>d2 <span style="color:#f92672">&lt;-</span> df[,<span style="color:#a6e22e">length</span>(P2),by<span style="color:#f92672">=</span>P1] +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">pdf</span>(<span style="color:#e6db74">&#34;socialdegreedist.pdf&#34;</span>) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">plot</span>(<span style="color:#a6e22e">ecdf</span>(d2<span style="color:#f92672">$</span>V1),main<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Cummulative distribution #friends per user&#34;</span>, +</span></span><span style="display:flex;"><span> xlab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Number of friends&#34;</span>, ylab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Percentage number of users&#34;</span>, log<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;x&#34;</span>, +</span></span><span style="display:flex;"><span> xlim<span style="color:#f92672">=</span><span 
style="color:#a6e22e">c</span>(<span style="color:#ae81ff">0.8</span>, <span style="color:#a6e22e">max</span>(d2<span style="color:#f92672">$</span>V1) <span style="color:#f92672">+</span> <span style="color:#ae81ff">20</span>)) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">dev.off</span>() +</span></span></code></pre></div><h3 id="data-correlations">Data Correlations</h3> +<p>Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.</p> +<p><em>Which are the most popular names of a country?</em></p> +<p>We run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, <em>&lsquo;A_country_name&rsquo;</em> is the name of a particular country such as <em>&lsquo;Germany&rsquo;, &lsquo;Netherlands&rsquo;, or &lsquo;Vietnam&rsquo;</em>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> p_lastname, <span style="color:#66d9ef">count</span> (p_lastname) <span style="color:#66d9ef">as</span> namecnt +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">FROM</span> person, country +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> p_placeid <span style="color:#f92672">=</span> ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;A_country_name&#39;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> p_lastname <span style="color:#66d9ef">order</span> <span style="color:#66d9ef">by</span> namecnt <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as <em>Muller</em> is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names&rsquo; distribution may not be exactly the same as the contemporary names&rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.</p> +<p><img src="distribution-germany.png" alt="image"> <br> +Figure 2. 
Distribution of names in Germany</p> +<p><img src="distribution-netherlands.png" alt=""> <br> +Figure 3. Distribution of names in Netherlands</p> +<p><img src="distribution-vietnam.png" alt=""> <br> +Figure 4. Distribution of names in Vietnam</p> +<p><em>Where my friends are living?</em></p> +<p>We run the following query, which computes the locations of the friends of people living in China.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> fctry.ctry_name, <span style="color:#66d9ef">count</span> (<span style="color:#f92672">*</span>) <span style="color:#66d9ef">from</span> person <span style="color:#66d9ef">self</span>, person +</span></span><span style="display:flex;"><span>friend, country pctry, knows, country fctry +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> pctry.ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;China&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> <span style="color:#66d9ef">self</span>.p_placeid <span style="color:#f92672">=</span> pctry.ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> <span style="color:#66d9ef">self</span>.p_personid <span style="color:#66d9ef">and</span> friend.p_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> fctry.ctry_city <span style="color:#f92672">=</span> friend.p_placeid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> fctry.ctry_name <span 
style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.</p> +<p><img src="chinese-friends.png" alt=""> <br> +Figure 5. Locations of friends of people in China</p> +<p><em>Where my friends are studying?</em></p> +<p>Finally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> o2.o_name, <span style="color:#66d9ef">count</span>(o2.o_name) <span style="color:#66d9ef">from</span> knows, person_university +</span></span><span style="display:flex;"><span>p1, person_university p2, organisation o1, organisation o2 +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> +</span></span><span style="display:flex;"><span> p1.pu_organisationid <span style="color:#f92672">=</span> o1.o_organisationid +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> o1.o_name<span style="color:#f92672">=</span><span style="color:#e6db74">&#39;Hangzhou_International_School&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> p1.pu_personid <span style="color:#66d9ef">and</span> p2.pu_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> 
p2.pu_organisationid <span style="color:#f92672">=</span> o2.o_organisationid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> o2.o_name <span style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, the top-10 universities for the friends of the Hangzhou School students are from China, while people from foreign universities have a small number of friends that study in Hangzhou School (See Table 1).</p> +<p><img src="friends-international-school.png" alt=""> <br> +Figure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.</p> +<table> +<thead> +<tr> +<th>Name</th> +<th># of friends</th> +</tr> +</thead> +<tbody> +<tr> +<td>Hangzhou_International_School</td> +<td>12696</td> +</tr> +<tr> +<td>Anhui_University_of_Science_and_Technology</td> +<td>4071</td> +</tr> +<tr> +<td>China_Jiliang_University</td> +<td>3519</td> +</tr> +<tr> +<td>&hellip;</td> +<td></td> +</tr> +<tr> +<td>Darmstadt_University_of_Applied_Sciences</td> +<td>1</td> +</tr> +<tr> +<td>Calcutta_School_of_Tropical_Medicine</td> +<td>1</td> +</tr> +<tr> +<td>Chettinad_Vidyashram</td> +<td>1</td> +</tr> +<tr> +<td>Women&rsquo;s_College_Shillong</td> +<td>1</td> +</tr> +<tr> +<td>Universitas_Nasional</td> +<td>1</td> +</tr> +</tbody> +</table> +<p>Table 1. Universities where friends of Hangzhou International School students are studying at.</p> +<p>In a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. 
Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduces many of those important characteristics found in a real social network, and additionally introduce a series of plausible correlations in it. More and more interesting data correlations may also be found from playing with the SNB generated data.</p> + + + + + SNB Driver - Part 1 + https://ldbcouncil.org/post/snb-driver-part-1/ + Thu, 27 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-1/ + <p>In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: <a href="https://github.com/ldbc/ldbc_driver/">https://github.com/ldbc/ldbc_driver/</a>. Multiple reference implementations by two vendors are available here: <a href="https://github.com/ldbc/ldbc_snb_implementations">https://github.com/ldbc/ldbc_snb_implementations</a>, and discussion of the schema, data properties, and related content is available here: <a href="https://github.com/ldbc/ldbc_snb_docs">https://github.com/ldbc/ldbc_snb_docs</a>.</p> +<p>The following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.</p> +<h3 id="problem-description">Problem Description</h3> +<p>The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person&rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. 
If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting transaction stream into parallel clients (terminals) is trivial. However, for LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.</p> +<p>Consider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment to the picture of User A, etc. The second event depends on the first one in a sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in case of the single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. 
Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.</p> +<p>The next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.</p> + + + + + Making Semantic Publishing Execution Rules + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + Tue, 18 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + <p><a href="https://ldbcouncil.org/">LDBC</a> <a href="https://ldbcouncil.org/benchmarks/spb">SPB (Semantic Publishing Benchmark)</a> is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC&rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an <a href="https://www.ontotext.com/products/ontotext-graphdb-owlim/">Ontotext Graph DB</a> deployment. Graph DB was formerly known as Owlim.</p> +<p>So, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.</p> +<p>SPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:</p> +<ul> +<li> +<p>Some queries are centred on a particular search or entity. The data touched by the query size does not grow at the same rate as the dataset.</p> +</li> +<li> +<p>Some queries cover whole cross sections of the dataset, e.g. 
find the most popular tags across the whole database.</p> +</li> +</ul> +<p>These different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.</p> +<p>Another guiding factor of SPB was the BBC&rsquo;s and others&rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.</p> +<p>Normally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:</p> +<ul> +<li> +<p>Updates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.</p> +</li> +<li> +<p>Short queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.</p> +</li> +<li> +<p>Analytics - These cover a large fraction of the dataset and are roughly linear to data size.</p> +</li> +</ul> +<p>A test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. 
A given system may be especially good at one aspect, leading the test sponsor to accentuate this.</p> +<p>The benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 (1.6 billion triples) SPB database we have the following space consumption figures:</p> +<ul> +<li> +<p>46886 MB of RDF literal text</p> +</li> +<li> +<p>23924 MB of full text index for RDF literals</p> +</li> +<li> +<p>23598 MB of URI strings</p> +</li> +<li> +<p>21981 MB of quads, stored column-wise with default index scheme</p> +</li> +</ul> +<p>Clearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.</p> +<p>Let&rsquo;s now look at a full run at unit scale, i.e. 50M triples.</p> +<p>The run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. 
The run stops by itself when the last of the analytical mixes finishes.</p> +<p>The interactive driver reports:</p> +<pre tabindex="0"><code>Seconds run : 2144 + Editorial: + 2 agents + + 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) + 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) + 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) + + 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) + 39.7122 average operations per second + + Aggregation: + 20 agents + + 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) + 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) + 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) + 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) + 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) + 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) + 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) + 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) + 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) + 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) + 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) + + 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] + + 45318 total retrieval queries (0 timed-out) + 22.5239 average queries per second +</code></pre><p>The analytical driver reports:</p> +<pre tabindex="0"><code>Aggregation: + 2 agents + + 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) + 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) + 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) + 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) + 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) + 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) + 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) + 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) + 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) + 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) + 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) + 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) + 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) + 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) + 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] + + 180 total retrieval queries (2 timed-out) + 0.0862 average queries per second +</code></pre><p>The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)</p> +<p>The SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). 
The DBMS is Virtuoso open source, (<a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack at github.com</a>, <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics</a>).</p> +<p>The minimum update rate of 7/s was sustained but fell short of the target of 70/s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. The analytics mix is for example about 3x faster without other concurrent activity.</p> +<p>Is this good or bad? I would say that this is possible but better can certainly be accomplished.</p> +<p>The initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualifies the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.</p> +<p>As an initial comment on the workload mix, I&rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.</p> +<p>Adjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.</p> +<p>In the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.</p> + + + + + Fifth TUC Meeting + https://ldbcouncil.org/event/fifth-tuc-meeting/ + Fri, 14 Nov 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fifth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its fifth Technical User<br> +Community (TUC) meeting.</p> +<p>This will be a one-day event at the National Hellenic Research Foundation<br> +in Athens, Greece on <strong>Friday November 14, 2014</strong>.</p> +<h3 id="agenda">Agenda</h3> +<p>10:30 - 11:00 Coffee Break</p> +<p>11:00 - 11:10 Peter Boncz (VUA) Welcome &amp; LDBC project status update (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979841.pptx">Presentation</a>)</p> +<p>11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark: Short Presentation of SPB and Status</p> +<p>Feedback &amp; Roadmap for SPB &amp; OWLIM (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979839.pdf">Presentation</a>)</p> +<p>11:25 - 11:30 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SPB &amp; Virtuoso (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979828.pdf">Presentation</a>)</p> +<p>11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback &amp; Roadmap for SNB &amp; Neo4J (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979830.pdf">Presentation</a>)</p> +<p>11:45 - 12:00 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SNB &amp; Virtuoso (<a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979829.pdf">Presentation</a>)</p> +<p>12:00 - 12:20 Arnau Prat (UPC) &amp; Andrey Gubichev Status, Feedback &amp; Roadmap for SNB Interactive &amp; Sparksee (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979836.pdf">Presentation</a>) and Business Intelligence (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979837.pdf">Presentation</a>)</p> +<p>12:20 - 12:40 Tomer Sagi, &ldquo;Experience with SNB and TitanDB at HP&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979838.pptx">Presentation</a>)</p> +<p>12:40 - 13:00 Jakob Nelson, &ldquo;graphbench.org on the SNB datagen&rdquo;</p> +<p>13:00 - 14:30 Lunch Break@Byzantine &amp; Christian Museum (<a href="http://www.byzantinemuseum.gr/en/">link</a>)</p> +<p>14:30 - 14:50 Olaf Hartig, &ldquo;Integrating the Property Graph and RDF data models&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979831.pdf">Presentation</a>)</p> +<p>Documents: <a href="http://arxiv.org/abs/1409.3288">arxiv/1409.3288</a>, <a href="http://arxiv.org/abs/1406.3399">arxiv/1406.3399</a></p> +<p>14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, &ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979842.pdf">Presentation</a>)</p> +<p>15:10 - 15:30 Evaggelia Pitoura, &ldquo;Historical Queries on Graphs&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979835.pdf">Presentation</a>)</p> +<p>15:30 - 16:00 Coffee Break</p> +<p>16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, 
&ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979843.pdf">Presentation</a>)</p> +<p>16:20 - 16:40 Gunes Aluc, &ldquo;WatDiv: How to Tune-up your RDF Data Management System&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979832.pdf">Presentation</a>)</p> +<p>16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, &ldquo;Benchmarking @LogicBlox&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979840.pdf">Presentation</a>)</p> +<p>17:00 - 17:15 Hassan Chafi, &ldquo;Oracle Labs Graph Strategy&rdquo;</p> +<p>17:15 - 17:25 Yinglong Xia, &ldquo;Property Graphs for Industry Solution at IBM&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979834.pdf">Presentation</a>)</p> +<p>17:25 - 17:30 Arthur Keen, &ldquo;Short Introduction to SPARQLcity&rdquo;</p> +<p><em><strong>20:30 Dinner @ Konservokouti <a href="https://plus.google.com/114240752029716758955/about?gl=gr&amp;hl=en">(link)</a></strong></em></p> +<p><em><strong>Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion</strong></em></p> +<h4 id="logistics">Logistics</h4> +<p>The meeting will be held at the <a href="http://www.eie.gr/index-en.html">National Hellenic Research Foundation</a> located in <a href="http://www.eie.gr/location-en.html">downtown Athens</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/5964344.gif" alt=""></p> +<h4 id="travel">Travel</h4> +<p>Athens, Greece&rsquo;s capital city, is easily accessible by air. 
Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.</p> +<p>To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).</p> +<p>You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: <a href="http://www.aia.gr/traveler/">http://www.aia.gr/traveler/</a></p> + + + + + Getting Started With the Semantic Publishing Benchmark + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + Sun, 09 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + <p>The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.</p> +<p>The benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:</p> +<ul> +<li>clustering of creative works around certain entities from the reference datasets (e.g. 
the association of an entity with creative works would decay exponentially in time)</li> +<li>correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time)</li> +</ul> +<p>The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.</p> +<p>Currently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. 
The queries aim at checking different choke points relevant to query optimisation such as:</p> +<ul> +<li>join ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema</li> +<li>subselects that aggregate the query results that the optimiser should recognise and evaluate first</li> +<li>optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last</li> +<li>reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.)</li> +<li>unions to be executed in parallel</li> +<li>optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results</li> +<li>ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results</li> +<li>handling of geo-spatial predicates</li> +<li>full-text search optimisation</li> +<li>asynchronous execution of the aggregate sub-queries</li> +<li>use of distinct to choose the optimal query plan</li> +</ul> +<p>We give below Query 1 of the Semantic Publishing Benchmark.</p> +<pre tabindex="0"><code>PREFIX bbcevent:&lt;http://www.bbc.co.uk/ontologies/event/&gt; +PREFIX geo-pos:&lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; +PREFIX bbc:&lt;http://www.bbc.co.uk/ontologies/bbc/&gt; +PREFIX time:&lt;http://www.w3.org/2006/time#&gt; +PREFIX event:&lt;http://purl.org/NET/c4dm/event.owl#&gt; +PREFIX music-ont:&lt;http://purl.org/ontology/mo/&gt; +PREFIX rdf:&lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt; +PREFIX foaf:&lt;http://xmlns.com/foaf/0.1/&gt; +PREFIX provenance:&lt;http://www.bbc.co.uk/ontologies/provenance/&gt; +PREFIX owl:&lt;http://www.w3.org/2002/07/owl#&gt; +PREFIX cms:&lt;http://www.bbc.co.uk/ontologies/cms/&gt; +PREFIX news:&lt;http://www.bbc.co.uk/ontologies/news/&gt; +PREFIX 
cnews:&lt;http://www.bbc.co.uk/ontologies/news/cnews/&gt; +PREFIX cconcepts:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX dbp-prop:&lt;http://dbpedia.org/property/&gt; +PREFIX geonames:&lt;http://sws.geonames.org/&gt; +PREFIX rdfs:&lt;http://www.w3.org/2000/01/rdf-schema#&gt; +PREFIX domain:&lt;http://www.bbc.co.uk/ontologies/domain/&gt; +PREFIX dbpedia:&lt;http://dbpedia.org/resource/&gt; +PREFIX geo-ont:&lt;http://www.geonames.org/ontology#&gt; +PREFIX bbc-pont:&lt;http://purl.org/ontology/po/&gt; +PREFIX tagging:&lt;http://www.bbc.co.uk/ontologies/tagging/&gt; +PREFIX sport:&lt;http://www.bbc.co.uk/ontologies/sport/&gt; +PREFIX skosCore:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX dbp-ont:&lt;http://dbpedia.org/ontology/&gt; +PREFIX xsd:&lt;http://www.w3.org/2001/XMLSchema#&gt; +PREFIX core:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX curric:&lt;http://www.bbc.co.uk/ontologies/curriculum/&gt; +PREFIX skos:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX cwork:&lt;http://www.bbc.co.uk/ontologies/creativework/&gt; +PREFIX fb:&lt;http://rdf.freebase.com/ns/&gt; + +# Query Name : query1 +# Query Description : +# Retrieve creative works about thing t (or that mention t) +# reasoning: rdfs:subClassOf, rdf:type +# join ordering: cwork:dateModified rdf:type owl:FunctionalProperty +# join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty +# Choke Points : +# - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified +# Optimizer should use an efficient cost evaluation method for choosing the optimal join tree +# - A sub-select which aggregates results. Optimizer should recognize it and execute it first +# - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) +# Optimizer should decide to put optional triples on top of the join tree +# (i.e. 
delay their execution to the last possible moment) because OPTIONALs are treated as a left join +# - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork +# and eliminate first triple (?cwork a ?type .) since ?cwork is a cwork:CreativeWork​ + +CONSTRUCT { + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:shortTitle ?shortTitle ; + cwork:about ?about ; + cwork:mentions ?mentions ; + cwork:dateCreated ?created ; + cwork:dateModified ?modified ; + cwork:description ?description ; + cwork:primaryFormat ?primaryFormat ; + bbc:primaryContentOf ?webDocument . + ?webDocument bbc:webDocumentType ?webDocType . + ?about rdfs:label ?aboutLabel ; + bbc:shortLabel ?aboutShortLabel ; + bbc:preferredLabel ?aboutPreferredLabel . + ?mentions rdfs:label ?mentionsLabel ; + bbc:shortLabel ?mentionsShortLabel ; + bbc:preferredLabel ?mentionsPreferredLabel . + ?creativeWork cwork:thumbnail ?thumbnail . + ?thumbnail a cwork:Thumbnail ; + cwork:altText ?thumbnailAltText ; + cwork:thumbnailType ?thumbnailType . +} +WHERE { + { + SELECT ?creativeWork + WHERE { + ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . + ?creativeWork a cwork:CreativeWork ; + cwork:dateModified ?modified . + } + ORDER BY DESC(?modified) + LIMIT 10 + } + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:dateModified ?modified . + OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } + OPTIONAL { ?creativeWork cwork:description ?description . } + OPTIONAL { ?creativeWork cwork:about ?about . + OPTIONAL { ?about rdfs:label ?aboutLabel . } + OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } + OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } + } + OPTIONAL { + ?creativeWork cwork:mentions ?mentions . + OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } + OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . 
} + OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } + } + OPTIONAL { ?creativeWork cwork:dateCreated ?created . } + OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } + OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . + OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } + OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } + } +} +</code></pre><p>Listing 1. Semantic Publishing Benchmark: Query 1</p> +<p>The benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.</p> +<p>The data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. 
The current version of the benchmark has been tested with Virtuoso and OWLIM.</p> +<p>The generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log) (c) the benchmark results (semantic_publishing_benchmark_results.log ).</p> +<p>Below we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and writes) and queries executed at the Nth second of a benchmark run. It also reports that total number of retrieval queries as well as the average number of queries executed per second.</p> +<pre tabindex="0"><code>Seconds run : 600 + Editorial: + 0 agents + + 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) + 0.0000 average operations per second + + Aggregation: + 8 agents + + 298 Q1 queries + 267 Q2 queries + 243 Q3 queries + 291 Q4 queries + 320 Q5 queries + 286 Q6 queries + 255 Q7 queries + 274 Q8 queries + 271 Q9 queries + + 2505 total retrieval queries + 4.1750 average queries per second +</code></pre><p>Listing 2. 
A snippet of semantic_publishing_benchmark_results.log</p> +<p>We run the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries timeout after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:</p> +<table> +<thead> +<tr> +<th>#triples</th> +<th>Q1</th> +<th>Q2</th> +<th>Q3</th> +<th>Q4</th> +<th>Q5</th> +<th>Q6</th> +<th>Q7</th> +<th>Q8</th> +<th>Q9</th> +<th>#queries</th> +<th>avg. #q. per sec.</th> +</tr> +</thead> +<tbody> +<tr> +<td>10M</td> +<td>298</td> +<td>267</td> +<td>243</td> +<td>291</td> +<td>320</td> +<td>286</td> +<td>255</td> +<td>274</td> +<td>271</td> +<td>2505</td> +<td>4.1750</td> +</tr> +<tr> +<td>100M</td> +<td>53</td> +<td>62</td> +<td>51</td> +<td>52</td> +<td>44</td> +<td>62</td> +<td>25</td> +<td>55</td> +<td>45</td> +<td>449</td> +<td>0.7483</td> +</tr> +<tr> +<td>1B</td> +<td>34</td> +<td>29</td> +<td>22</td> +<td>24</td> +<td>25</td> +<td>29</td> +<td>0</td> +<td>29</td> +<td>28</td> +<td>220</td> +<td>0.3667</td> +</tr> +</tbody> +</table> + + + + + Choke Point Based Benchmark Design + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + <p>The <em>Linked Data Benchmark Council</em> (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and 
arbitrate around the official benchmark results. The council and its <a href="https://ldbcouncil.org">https://ldbcouncil.org</a> website just launched, and in its first 1.5 year of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (<a href="https://ldbcouncil.org/event/fifth-tuc-meeting">next TUC meeting</a> will be on October 5 in Athens) and indeed in <em>designing benchmarks</em>.</p> +<p>So, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by <a href="http://www.tpc.org/">TPC</a> have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are <em>relevant</em> and <em>representative</em> (address important challenges encountered in practice), <em>understandable</em> , <em>economical</em> (implementable on simple hardware), <em>fair</em> (such as not to favor a particular product or approach), <em>scalable</em>, <em>accepted</em> by the community and <em>public</em> (e.g. all of its software is available in open source). This list stems from Jim Gray&rsquo;s <a href="http://research.microsoft.com/en-us/um/people/gray/BenchmarkHandbook/TOC.htm">Benchmark Handbook</a>. In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.</p> +<p>A very important aspect of benchmark development is making sure that the community <em>accepts</em> a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. 
Since in LDBC multiple commercial graph and RDF vendors are on the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on <strong>fairness</strong> had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its <a href="https://ldbcouncil.org/tags/tuc-meeting/">Technical User Community gatherings</a>) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.</p> +<p>The need for <em>understandability</em> for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. 
Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.</p> +<p>The <em>economical</em> aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machinecode SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.</p> +<p><em>Representative</em> benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. 
A rigorous example of this is the <a href="http://aksw.org/Projects/DBPSB.html">DBpedia benchmark</a> whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).</p> +<p>The fact that a benchmark can be <em>scaled</em> in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. 
Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) &ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.</p> +<p>Now, what makes a benchmark <em>relevant</em>? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of <em>&ldquo;choke points&rdquo;</em>: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute value of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). 
The notion observed in practice is that people who are direct colleagues, often are in each other's friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore a suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.</p> +<p>To illustrate what choke points are in more depth, we wrote a <a href="https://ldbcouncil.org/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf">paper in the TPCTC 2013</a> conference that performs a post-mortem analysis of TPC-H and identified 28 such choke points. <em><a href="chokepoints.png">This table</a></em> lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):</p> +<p>I would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper,RDF-3X), and me (MonetDB,Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can <a href="http://www.openlinksw.com/weblog/oerling/?id=1779">trivialize Q18</a>; and this optimization can single-handedly improve the overall TPC-score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.</p> +<p>LDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark <a href="https://ldbcouncil.org/developer/spb">(SPB)</a> and the more graph-focused Social Network Benchmark (<a href="https://ldbcouncil.org/developer/snb">SNB</a>), and <a href="https://groups.google.com/forum/#!forum/ldbcouncil">tell us what you think</a>. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.</p> +<p><em>(for more posts from Peter Boncz, see also <a href="https://databasearchitects.blogspot.com">Database Architects</a>, a blog about data management challenges and techniques written by people who design and implement database systems)</em></p> + + + + + New Website Online LDBC Benchmarks Reach Public Draft + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + <p>The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. 
Watch this blog for the announcements to come, as this will be a matter of weeks to add.</p> +<p>The Public Draft stage means that the initial software (data generator, query driver) work and an initial technical specification and documentation has been written. In other words, there is a testable version of the benchmark available for anyone who is interested. Public Draft status does not mean that the benchmark has been adopted yet, it rather means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth <a href="https://ldbcouncil.org/event/fifth-tuc-meeting">Technical User Community meeting</a>. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.</p> +<p>You can also see that we created this new website and a new logo. This website is different from <code>http://ldbc.eu</code> that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.</p> +<p>In the next weeks, you will see many contributors in LDBC post items on this blog. 
Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.</p> + + + + + Social Network Benchmark Goals + https://ldbcouncil.org/post/social-network-benchmark-goals/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/social-network-benchmark-goals/ + <p>Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.</p> +<p>From a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.</p> +<p>With the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, <a href="https://ldbcouncil.org/benchmarks/snb">SNB</a>, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. 
Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.</p> +<p>The SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">LDBC Github repository</a>. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD programming contest 2014</a>.</p> +<p>The SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.</p> +<p>More details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.</p> + + + + + Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + <p>It is with great pleasure that we announce the new LDBC organisation site at <a href="https://www.ldbcouncil.org">www.ldbcouncil.org</a>. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the <em>benchmarks</em> menu on this site.</p> +<p>Those benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.</p> +<p>While the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.</p> +<p>We want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.</p> +<p>Finally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.</p> +<p>In all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.</p> + + + + + 2nd International Workshop on Benchmarking RDF Systems + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + <p>Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops. The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.</p> +<p>More at: <a href="http://events.sti2.at/bersys2014/">http://events.sti2.at/bersys2014/</a></p> + + + + + DATAGEN: Data Generation for the Social Network Benchmark + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + <p>As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 <a href="#references">[1]</a>.</p> +<p>One of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.</p> +<p>For these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:</p> +<ul> +<li><strong>Realism.</strong> The data generated by DATAGEN has to mimic the features of those found in a real social network. 
In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. DATAGEN is aware of the data and link distributions found in a real social network such as Facebook <a href="#references">[2]</a>. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated.</li> +<li><strong>Scalability.</strong> Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters.</li> +<li><strong>Determinism.</strong> DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus making the comparisons between different systems fair and the benchmarks’ results reproducible.</li> +<li><strong>Usability.</strong> LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been strongly influenced by this philosophy, and therefore it has been designed to be as easy to use as possible.</li> +</ul> +<p>Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (<a href="https://github.com/ldbc/ldbc_snb_datagen">https://github.com/ldbc/ldbc_snb_datagen</a>).</p> +<h4 id="references">References</h4> +<p>[1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. &ldquo;S3g2: A scalable structure-correlated social graph generator.&rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 
156-172.</p> +<p>[2] Prat-Pérez, Arnau, and David Dominguez-Sal. &ldquo;How community-like is the structure of synthetically generated graphs?.&rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. ACM, 2014.</p> + + + + + Getting Started With SNB + https://ldbcouncil.org/post/getting-started-with-snb/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-snb/ + <p>In a previous blog post titled &ldquo;<a href="https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/">Is SNB like Facebook&rsquo;s LinkBench?</a>&rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.</p> +<h3 id="datagen">DATAGEN</h3> +<p>DATAGEN is the data generator used by all the workloads of SNB. <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/">Here</a> we introduced the design goals that drive the development of DATAGEN, which can be summarized as: <em>Realism, Scalability, Determinism and Usability.</em></p> +<p>DATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.</p> +<p><img src="schema.png" alt="image"></p> +<p>For the sake of credibility, data produced by DATAGEN has to be realistic. 
In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:</p> +<ul> +<li> +<p>Realistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as DBpedia.</p> +</li> +<li> +<p>Correlated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical of that country, to work at companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.</p> +</li> +</ul> +<p>DATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.</p> +<p>Finally, DATAGEN outputs two other things:</p> +<ul> +<li> +<p>Update Streams, which will be used in the future to implement updates in the workloads.</p> +</li> +<li> +<p>Substitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so that the query plans of the resulting query executions do not differ significantly.</p> +</li> +</ul> +<p>Configuring and using DATAGEN is easy. 
Please visit <a href="https://github.com/ldbc/ldbc_snb_datagen">this page</a> for more information.</p> +<h3 id="ldbc-driver">LDBC driver</h3> +<p>SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.</p> +<p>It is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation <a href="https://github.com/ldbc/ldbc_driver/wiki">page</a>.</p> +<p>The test sponsor (aka the implementer of the benchmark) has to provide a set of implemented interfaces that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.</p> +<p>Given a workload consisting of a series of <em>Operations</em>, the test sponsor implements <em>OperationHandlers</em> for them. <em>OperationHandlers</em> are responsible for executing instances of a specific operation (query) type. This is done by overriding the method <em>executeOperation</em>(), which receives as input parameter an <em>Operation</em> instance and returns the result. From the <em>Operation</em> instance, the operation&rsquo;s input parameters can be retrieved, as well as the database connection state.</p> +<p>The database connector is used to initialize, cleanup and get the database connection state. 
The database connector must implement the <em>Db</em> interface, which consists of three methods: <em>onInit</em>(), <em>onCleanup</em>() and <em>getConnectionState</em>(). <em>onInit</em>() is called before the benchmark is executed, and is responsible for initializing the database and registering the different <em>OperationHandlers</em>. <em>onCleanup</em>() is called after the benchmark has completed. Any resources that need to be released should be released here.</p> +<p>Finally, <em>getConnectionState</em>() returns an instance of <em>DbConnectionState</em>, which encapsulates any state that needs to be shared between <em>OperationHandler</em> instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.</p> +<p>A good example of how to implement the benchmark can be found <a href="https://github.com/ldbc/ldbc_driver/wiki/Implementing%20a%20Database%20Connector">here</a>.</p> +<h3 id="workloads">Workloads</h3> +<p>Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are in the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.</p> +<p>The Interactive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the database. These queries are designed to be interactive and target systems capable of responding to such queries with low latency for multiple concurrent users. 
Examples of Interactive queries are retrieving, for a given user, those friends with a specific name, or finding the most recent posts and comments created by your friends.</p> +<p>The Business Intelligence workload will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.</p> +<p>Examples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.</p> +<p>Finally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality are representative queries of this workload, and will imply touching a vast amount of the dataset.</p> +<h3 id="final-remarks">Final remarks</h3> +<p>This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification <a href="https://github.com/ldbc/ldbc_snb_docs">draft</a>, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.</p> + + + + + Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + <p>The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. 
This post introduces the interactive workload.</p> +<p>The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user&rsquo;s social environment and potentially access data associated with the friends of a user and their friends.</p> +<p>This is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or TPC-E. The emphasis is on presenting a rich and timely view of a constantly changing environment.</p> +<p>SNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook LinkBench but SNB Interactive goes well beyond this in richness of schema and queries.</p> +<p>The challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB Interactive seeks more agility and ad hoc capability also on the operational side.</p> +<p>The dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. 
This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.</p> +<p>The metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.</p> +<p>Different technologies can be used for implementing SNB Interactive. The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database APIs.</p> +<p>SNB Interactive is an example of LDBC&rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.</p> +<p>The benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. 
Specifics of availability and coverage may vary.</p> +<p>Subsequent posts will address the workload in more detail.</p> + + + + + Is SNB Like Facebooks LinkBench + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + <p>In this post, I will discuss in some detail the rationale and goals of the design of the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook&rsquo;s own graph benchmark called <a href="https://www.facebook.com/notes/facebook-engineering/linkbench-a-database-benchmark-for-the-social-graph/10151391496443920">LinkBench</a>. We think SNB is the most intricate graph database benchmark to date (it&rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference <a href="http://www.sigmod2014.org/">SIGMOD in Snowbird</a> after being used for this year&rsquo;s <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD Programming Contest</a>, which was about graph analytics.</p> +<p>SNB is intended to provide the following <strong>value</strong> to different stakeholders:</p> +<ul> +<li> +<p>For end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to <em>compare merits of different products</em> and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.</p> +</li> +<li> +<p>For vendors of graph database technology, SNB provides a <em>checklist of features</em> and performance characteristics that helps in product positioning and can serve to guide new development.</p> +</li> +<li> +<p>For researchers, both industrial and academic, the SNB dataset and workload provide <em>interesting challenges</em> in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provide a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.</p> +</li> +</ul> +<p>I should clarify that even though the data model of SNB resembles Facebook (and we&rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use; they don&rsquo;t need LDBC for that. Rather, we take social network data as a model for the much broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis are not just about finding data by value, but about learning about the <em>connection patterns</em> between data. 
The scenario of the SNB, a social network, was chosen with the following goals in mind:</p> +<ul> +<li> +<p>the benchmark scenario should be <strong>understandable</strong> to a large audience, and this audience should also understand the relevance of managing such data.</p> +</li> +<li> +<p>the scenario in the benchmark should cover the complete range of challenges <strong>relevant</strong> for graph data management, according to the benchmark scope.</p> +</li> +<li> +<p>the query challenges in it should be <strong>realistic</strong> in the sense that, though synthetic, similar data and workloads are encountered in practice.</p> +</li> +</ul> +<p>The SNB is in fact three distinct benchmarks with a common dataset, since there are <em>three different workloads</em>. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.</p> +<ul> +<li> +<p><strong>Interactive Workload.</strong> The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j&rsquo;s Cypher, SPARQL and SQL. The interactive workloads tests a system&rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. 
One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views; however, such structures need to be maintained with regard to the continuous inserts that are also part of the workload. This workload is now in draft stage, which means that the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a> and <a href="https://github.com/ldbc/ldbc_driver">driver software stack</a> are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and evolve them.</p> +</li> +<li> +<p><strong>Business Intelligence Workload.</strong> There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.</p> +</li> +<li> +<p><strong>Graph Analytics Workload.</strong> This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph. 
The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.</p> +</li> +</ul> +<p>All the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a>. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to <a href="http://people.cs.uchicago.edu/~tga/pubs/sigmod-linkbench-2013.pdf">LinkBench</a>, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the <strong>low-level</strong> MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. Note that <a href="http://borthakur.com/ftp/sigmod2013.pdf">Facebook&rsquo;s IT infrastructure</a> does not store all user data in MySQL and its modified memcached (&quot;<a href="http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/11730-atc13-bronson.pdf">TAO</a>&quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. 
However, for queries like in the SNB Interactive and BI workloads it <strong>does</strong> matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with <em>correlations</em>, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a &ldquo;choke point&rdquo; for graph data management system where LDBC wants to stimulate innovation).</p> + + + + + Making It Interactive + https://ldbcouncil.org/post/making-it-interactive/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-it-interactive/ + <p><em>Synopsis:</em> Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.</p> +<p>It is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.</p> +<p>As part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. 
These touch, depending on the scale, upwards of a million entities in the database.</p> +<p>Now, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.</p> +<p>So, LDBC SNB has a twofold task:</p> +<ol> +<li>In order to be a credible interactive workload, it must in fact have characteristics of one.</li> +<li>In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics.</li> +</ol> +<p>Designing a workload presents specific challenges:</p> +<ol> +<li>The workload must be realistic enough for users to identify with it.</li> +<li>The workload must pose challenges and drive innovation in a useful direction.</li> +<li>The component operations must all play a noticeable role in it. If the operation&rsquo;s relative performance does not affect the score, why is it in the workload?</li> +</ol> +<p>The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.</p> +<p>Very roughly, the choke points (technical challenges) of SNB interactive are as follows:</p> +<ul> +<li>Random access - Traversing between people and content makes large numbers of random lookups. These can be variously parallelized and/or vectored.</li> +<li>Query optimization must produce the right plans - The primary point is join order and join type. Index vs. 
hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join.</li> +<li>When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations.</li> +<li>Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities.</li> +</ul> +<p>Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is doable and plausible in the scenario.</p> +<p>In online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.</p> +<p>A key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.</p> +<p>The other aspect is the metric, typically some variation on operations per unit of time.</p> +<p>All these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio. 
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.</p> +<p>So how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.</p> +<p>But SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.</p> +<p>Having this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to meet the following objectives:</p> +<ul> +<li>Short queries should collectively be about 45% of the CPU load.</li> +<li>Updates will be under 5%.</li> +<li>Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others.</li> +</ul> +<p>The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.</p> +<p>There is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.</p> +<p>In the next post we will look at the actual mix and execution times on the test system.</p> + + + + + SNB Data Generator - Getting Started + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">this</a> and <a href="https://ldbcouncil.org/post/getting-started-with-snb">this</a>) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. 
In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.</p> +<h3 id="getting-and-configuring-hadoop">Getting and Configuring Hadoop</h3> +<p>DATAGEN runs on top of hadoop 1.2.1 to be scalable. You can download it from here. Open a console and type the following commands to decompress hadoop into the /home/user folder:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz hadoop-1.2.1.tar.gz +</span></span></code></pre></div><p>For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done to configure it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster machine), visit the <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/wiki/Configuration">LDBC DATAGEN wiki</a>.</p> +<h3 id="getting-and-configuring-datagen">Getting and configuring DATAGEN</h3> +<p>Before downloading DATAGEN, be sure to fulfill the following requirements:</p> +<ul> +<li>Linux based machine</li> +<li>java 1.6 or greater</li> +<li>python 2.7.X</li> +<li>maven 3</li> +</ul> +<p>After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/releases">here</a>. 
Again, decompress the downloaded file with the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz +</span></span></code></pre></div><p>This will create a folder called “ldbc_snb_datagen-0.1.2”.</p> +<p>DATAGEN provides <em>run.sh</em>, a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>HADOOP_HOME<span style="color:#f92672">=</span>/home/user/hadoop-1.2.1 +</span></span><span style="display:flex;"><span>LDBC_SNB_DATAGEN_HOME<span style="color:#f92672">=</span>/home/user/ldbc_snb_datagen +</span></span></code></pre></div><p>HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute the <em>run.sh</em> script to compile and execute DATAGEN using default parameters. 
Type the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user/ldbc_snb_datagen-0.1.2 +</span></span><span style="display:flex;"><span>$ ./run.sh +</span></span></code></pre></div><p>This will run DATAGEN, and two folders will be created in the same directory: <em>social_network</em> containing the scale factor 1 dataset with csv uncompressed files, and <em>substitution_parameters</em> containing the substitution parameters needed by the driver to execute the benchmark.</p> +<h3 id="changing-the-generated-dataset">Changing the generated dataset</h3> +<p>The characteristics of the dataset to be generated are specified in the <em>params.ini</em> file. By default, this file has the following content:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:1</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:1</span> +</span></span></code></pre></div><p>The following is the list of options and their default values supported by DATAGEN:</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>scaleFactor</td> +<td>1</td> +<td>&ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000&rdquo;</td> +</tr> +<tr> +<td>serializer</td> +<td>csv</td> +<td>&ldquo;The format of the output data. 
Options are: csv, csv_merge_foreign, ttl&rdquo;</td> +</tr> +<tr> +<td>compressed</td> +<td>FALSE</td> +<td>Specifies to compress the output data in gzip.</td> +</tr> +<tr> +<td>outputDir</td> +<td>./</td> +<td>Specifies the folder to output the data.</td> +</tr> +<tr> +<td>updateStreams</td> +<td>FALSE</td> +<td>&ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static&rdquo;</td> +</tr> +<tr> +<td>numThreads</td> +<td>1</td> +<td>Sets the number of threads to use. Only works for pseudo-distributed mode</td> +</tr> +</tbody> +</table> +<p>For instance, a possible <em>params.ini</em> file could be the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:30</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:ttl</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For those not interested in generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>numPersons</td> +<td>-</td> +<td>The number of persons to generate</td> +</tr> +<tr> +<td>numYears</td> +<td>-</td> +<td>The amount of years of activity</td> 
+</tr> +<tr> +<td>startYear</td> +<td>-</td> +<td>The start year of simulation.</td> +</tr> +</tbody> +</table> +<p>The following is an example of another possible <em>params.ini</em> file</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">numPersons:100000</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numYears:3</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">startYear:2010</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv_merge_foreign</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/">GitHub</a>!</p> + + + + + The Day of Graph Analytics + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + <p><em>Note: consider this post as a continuation of the &ldquo;<a href="https://ldbcouncil.org/post/making-it-interactive">Making it interactive</a>&rdquo; post by Orri Erling.</em></p> +<p>I have now completed the <a href="https://github.com/openlink/virtuoso-opensource">Virtuoso</a> TPC-H work, including scale out. 
Optimization possibilities extend to infinity but the present level is good enough. <a href="http://www.tpc.org/tpch/">TPC-H</a> is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.</p> +<p>So, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.</p> +<p>The BI part is like TPC-H, except for adding the following challenges:</p> +<ul> +<li> +<p>Joins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.</p> +</li> +<li> +<p>Transitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.</p> +</li> +<li> +<p>Transitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.</p> +</li> +<li> +<p>Graph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. 
This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.</p> +</li> +<li> +<p>Running one query with parameters from different buckets, implying different best plan.</p> +</li> +<li> +<p>Data correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.</p> +</li> +<li> +<p>Large intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.</p> +</li> +<li> +<p>More unions and outer joins.</p> +</li> +</ul> +<p>The idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.</p> +<p>For rules and metric, we can use the TPC-H or <a href="http://www.tpc.org/tpcds/default.asp">TPC-DS</a> ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and <a href="http://www.openstreetmap.org/">Open Street Map</a>. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.</p> +<p>Doing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.</p> +<p>As a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. 
Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or &ldquo;Platonic&rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.</p> +<p>So, the transitive derived table construct can have pluggable aggregations, e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.</p> +<p>The distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. forward and backward in betweenness centrality.</p> +<p>This brings us to the graph analytics proper, which is often done in BSP style, e.g. <a href="http://es.slideshare.net/shatteredNirvana/pregel-a-system-for-largescale-graph-processing">Pregel</a>, <a href="http://giraph.apache.org">Giraph</a>, <a href="http://uzh.github.io/signal-collect/">Signal-Collect</a>, some but not all <a href="http://ppl.stanford.edu/main/green_marl.html">Green-Marl</a> applications. 
In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.</p> +<p>With BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as <a href="http://graphbench.org/">graphbench.org</a> but will be adapted to the SNB dataset.</p> +<p>The analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. 
The interested may pitch in as the matter comes up.</p> + + + + + Using LDBC SPB to Find OWLIM Performance Issues + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + Wed, 20 Aug 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + <p>During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (LDBC-SPB) as a part of our development and release process.</p> +<p>The first thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.</p> +<p>Initially we’ve decided to fix some of the benchmark parameters :</p> +<ul> +<li>the dataset size - 50 million triples (LDBC-SPB50)</li> +<li>benchmark warmup and benchmark run times - 60s and 600s respectively.</li> +<li>maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations)</li> +<li>maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations)</li> +<li>generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run.</li> +</ul> +<p>Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.</p> +<p>We’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. 
Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms of editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.</p> +<p>The following two charts show how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on a bare-metal hardware.</p> +<p><img src="16-2-Performance.png" alt="image"></p> +<p>Figure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.</p> +<p><img src="8-0-Performance.png" alt="image"></p> +<p>Figure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation running simultaneously. Read-only mode.</p> +<p>Another thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark&rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners can not be compared to backward-chaining ones which is not the goal of the benchmark. 
Loading performance is not measured “officially” by LDBC-SPB (although the time for loading the data is reported), but it’s a good thing to have when comparing RDF Stores.</p> +<p>An additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called <em>checkConformance</em>. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. The result of that phase is a list of all rules that have been passed or failed which is very useful for regression testing.</p> + + + + + Fourth TUC meeting + https://ldbcouncil.org/event/fourth-tuc-meeting/ + Thu, 03 Apr 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fourth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.</p> +<p>This will be a one-day event at CWI in Amsterdam on <em>Thursday April 3, 2014</em>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<p><strong>For presenters please limit your talks to just 15 minutes</strong></p> +<h3 id="agenda">Agenda</h3> +<p><strong>April 3rd</strong></p> +<ul> +<li> +<p>10:00 Peter Boncz (VUA) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506371.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=JYWVgrP1kVY">video</a>: <em>LDBC project status update</em></p> +</li> +<li> +<p>10:20 Norbert Martinez (UPC) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506375.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=4yREJQ3yDr0">video</a>: <em>Status update on the LDBC Social Network Benchmark (SNB) task force</em>.</p> +</li> +<li> +<p>10:50 Alexandru Iosup (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506363.ppt">ppt</a>, <a href="https://www.youtube.com/watch?v=ulT-RFwKpOE">video</a>: <em>Towards Benchmarking Graph-Processing Platforms</em></p> +</li> +<li> +<p>11:10 Mike Bryant (Kings College) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506364.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=KiHRTu9xx0A">video</a>: <em>EHRI Project: Archival Integration with Neo4j</em></p> +</li> +</ul> +<p><strong>11:30 coffee</strong></p> +<ul> +<li> +<p>11:50 Thilo Muth (University of Magdeburg) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506369.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=5xH3UDLP6Oc">video</a>: <em>MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis</em></p> +</li> +<li> +<p>12:10 Davy Suvee (Janssen Pharmaceutica / Johnson &amp; Johnson) – <a 
href="https://www.youtube.com/watch?v=XN3LRJUfJIU">video</a>: <em>Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph</em></p> +</li> +<li> +<p>12:30 Yongming Luo (TU Eindhoven) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506366.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=g_my3tBB2_s">video</a>: <em>Regularities and dynamics in bisimulation reductions of big graphs</em></p> +</li> +<li> +<p>12:50 Christopher Davis (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506370.pdf">pdf</a>, <a href="https://www.youtube.com/channel/UC6HbzfJ4016Vez-2HKNeDag">video</a>: <em>Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues</em></p> +</li> +</ul> +<p><strong>13:10 - 14:30 lunch @ restaurant Polder</strong></p> +<ul> +<li> +<p>14:30 <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506365.pptx">SPB task force report</a></p> +</li> +<li> +<p>15:00 Bastiaan Bijl (Sysunite) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506373.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=TsCeKDHShMY">video</a>: <em>Using a semantic approach for monitoring applications in large engineering projects</em></p> +</li> +<li> +<p>15:20 Frans Knibbe (Geodan) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506372.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=uAX-m4OewPM">video</a>: <em>Benchmarks for geographical data</em></p> +</li> +<li> +<p>15:40 Armando Stellato (University of Rome, Tor Vergata &amp; UN Food and Agriculture Organization) – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506374.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=mfA4csAs72Y">video</a>: <em>VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges</em></p> +</li> +</ul> +<p><strong>16:00 coffee</strong></p> +<ul> +<li> +<p>16:20 Ralph Hodgson (TopQuadrant) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=ZUDnVw9P_Rc">video</a>: <em>Customer experiences in implementing SKOS-based vocabulary management systems</em></p> +</li> +<li> +<p>16:40 Simon Jupp (European Bioinformatics Institute) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506368.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=CgTuOGK92W8">video</a>: <em>Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.</em></p> +</li> +<li> +<p>17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506381.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=QTc3yOgoEsg">video</a>: <em>Breakmarking UniProt RDF. 
SPARQL queries that make your database cry&hellip;</em></p> +</li> +<li> +<p>17:20 Rein van &rsquo;t Veer (Digital Heritage Netherlands) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506380.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=2vDrZoskGyQ">video</a> <em>Time and space for heritage</em></p> +</li> +<li> +<p>17:40 <strong>end of meeting</strong></p> +</li> +<li> +<p>19:00 - 21:30 Social Dinner in restaurant Boom</p> +</li> +</ul> +<p><strong>April 4th</strong></p> +<p>LDBC plenary meeting for project partners.</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506362.ppt">Benchmarking Graph-Processing Platforms: A Vision</a> – Alexandru Iosup</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p>The meeting will be held at the Dutch national research institute for computer science and mathematics (<a href="http://www.cwi.nl">CWI</a> - Centrum voor Wiskunde en Informatica). 
It is located at <a href="http://www.amsterdamsciencepark.nl/">Amsterdam Science Park</a>:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505821.jpg" alt=""></p> +<p>(<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505820.pdf">A5 map</a>)</p> +<h6 id="travel">Travel</h6> +<p><strong>Arriving &amp; departing:</strong></p> +<p>Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, <a href="http://www.schiphol.com/">www.schiphol.nl</a>) that serves all main European carriers and also very many low-fare carriers.</p> +<p><a href="http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane">http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane</a></p> +<p><strong>Trains</strong> (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) &ndash; which station you are also likely arriving at in case of an international train trip.</p> +<p>From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 &ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).</p> +<p><strong>Taxi</strong> is an alternative, though expensive. 
The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).</p> +<p><strong>Public transportation</strong> (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.</p> +<p><strong>Only the &ldquo;disposable&rdquo; cards are interesting for you as visitor.</strong></p> +<p>Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.</p> +<p><strong>Getting Around:</strong> the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.</p> +<p><strong>Cars</strong></p> +<p>In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the &ldquo;WCW&rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.</p> +<p><strong>Arriving at CWI:</strong> Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. 
Then, you&rsquo;ll receive a visitor&rsquo;s pass that allows you to enter our building.</p> +<p><strong>Social Dinner</strong></p> +<p>The social dinner will take place at 7pm on April 3 in Restaurant Boom (<a href="http://www.boometenendrinken.nl/">boometenendrinken.nl</a>), Linneausstraat 63, Amsterdam.</p> + + + + + Third TUC Meeting + https://ldbcouncil.org/event/third-tuc-meeting/ + Tue, 19 Nov 2013 08:00:00 +0000 + + https://ldbcouncil.org/event/third-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!</p> +<p>This will be a one day event in London on the <strong>19 November 2013</strong> running in collaboration with the <a href="http://www.graphconnect.com/london/">GraphConnect</a> event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using this following coupon code: <strong>LDBCTUC</strong>.</p> +<p>The TUC event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology</li> +<li>Industry discussions on the contents of the benchmarks</li> +</ul> +<p>We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.</p> +<p>We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a></li> +<li><a href="#ldbctuc-background">LDBC/TUC Background</a> +<ul> +<li><a href="#social-network-benchmark">Social Network Benchmark</a></li> +<li><a href="#semantic-publishing-benchmark">Semantic Publishing Benchmark</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>November 19th - Public TUC Meeting</strong></p> +<p>8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)</p> +<p>short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)</p> +<p>NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.</p> +<p>10:00 TUC Meeting Opening (Peter Boncz)</p> +<p>10:10 TUC Presentations (RDF Application Descriptions)</p> +<ul> +<li>Johan Hjerling (BBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275669.pdf">BBC Linked Data and the Semantic Publishing Benchmark</a></strong></em></li> +<li>Andreas Both (Unister): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505027.pdf">Ontology-driven applications in an e-commerce context</a></strong></em></li> +<li>Nuno Carvalho (Fujitsu Laboratories Europe): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275666.pdf"><em><strong>Fujitsu RDF use cases and benchmarking requirements</strong></em></a></li> +<li>Robina Clayphan (Europeana): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816977.ppt">Europeana and Open Data</a></strong></em></li> +</ul> +<p>11:30 Semantic Publishing Benchmark (SPB)</p> +<ul> +<li>Venelin Kotsev (Ontotext - LDBC): <em><strong><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">Semantic Publishing Benchmark Task Force Update</a></strong></em> and <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">report</a></strong></em></li> +</ul> +<p>12:00-13:00 Lunch at the Graph Connect venue</p> +<p><em>Talks During Lunch:</em></p> +<ul> +<li>Pedro Furtado, Jorge Bernardino (Univ. Coimbra): <strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275671.pdf">KEYSTONE Cost Action</a></strong></li> +</ul> +<p>13:00 TUC Presentations (Graph Application Descriptions)</p> +<ul> +<li>Minqi Zhou / Weining Qian (East China Normal University): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275670.pdf">Elastic and realistic social media data generation</a></strong></em></li> +<li>Andrew Sherlock (Shapespace): <em><strong>Shapespace Use Case</strong></em></li> +<li>Sebastian Verheughe (Telenor): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275667.pdf">Real-time Resource Authorization</a></strong></em></li> +</ul> +<p>14:00 Social Network Benchmark (SNB)</p> +<ul> +<li>Norbert Martinez (UPC - LDBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505025.pdf">Social Network Benchmark Task Force Update</a></strong></em> and <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">Report</a></li> +</ul> +<p><em>14:30 Break</em></p> +<p>14:45 TUC Presentations (Graph Analytics)</p> +<ul> +<li>Keith Houck (IBM): <em><strong>Benchmarking experiences with [System G Native Store (tentative title)]</strong></em></li> +<li>Abraham Bernstein (University of 
Zurich): <em><strong>Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store</strong></em></li> +<li>Luis Ceze (University of Washington): <em><strong>Grappa and GraphBench Status Update</strong></em></li> +</ul> +<p><em>15:45 Break</em></p> +<p>16:00 TUC Presentations* (Possible Future RDF Benchmarking Topics)*</p> +<ul> +<li>Christian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): <em><strong>CIDOC-CRM</strong></em></li> +<li>Atanas Kiryakov (Ontotext): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275672.pdf">Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM)</a></strong></em></li> +<li>Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275668.pdf">Geographica: A Benchmark for Geospatial RDF Stores</a></strong></em></li> +<li>Xavier Lopez (Oracle): <em><strong>W3C Property Graph progress</strong></em></li> +<li>Thomas Scharrenbach (University Zurich) <em><strong>PCKS: Benchmarking Semantic Flow Processing Systems</strong></em></li> +</ul> +<p>17:20 Meeting Conclusion (Josep Larriba Pey)</p> +<p>17:30 End of TUC meeting</p> +<p>19:00 Social dinner</p> +<p><strong>November 20th - Internal LDBC Meeting</strong></p> +<p>10:00 Start</p> +<p>12:30 <em>End of meeting</em></p> +<ul> +<li>coffee and lunch provided</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p><strong>Date</strong></p> +<p>19th November 2013</p> +<p><strong>Location</strong></p> +<p>The TUC meeting will be held in <strong>The Tower</strong> hotel (<a href="http://goo.gl/qZt8Fz">Google Maps link</a>) approximately 4 minutes walk from the <a href="http://www.graphconnect.com/london/">GraphConnect</a> conference in London.</p> +<p>Getting there</p> +<ul> +<li>From City Airport is the easiest: short ride 
on the DLR to Tower Gateway. Easy.</li> +<li>From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554995.pdf">See attached</a>.</li> +</ul> +<h3 id="ldbctuc-background">LDBC/TUC Background</h3> +<p>Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SNB_Report_Nov2013.pdf</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SPB_Report_Nov2013.pdf</a></li> +</ul> +<p>A summary of these efforts can be read below or, for a more detailed account, please refer to: <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554967.pdf">The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort</a>. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.</p> +<h4 id="social-network-benchmark">Social Network Benchmark</h4> +<p>The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. 
The social network scenario was chosen with the following goals in mind:</p> +<ul> +<li>it should be understandable, and the relevance of managing such data should be understandable</li> +<li>it should cover the complete range of interesting challenges, according to the benchmark scope</li> +<li>the queries should be realistic, i.e., similar data and workloads are encountered in practice</li> +</ul> +<p>SNB includes a data generator for creation of synthetic social network data with the following characteristics:</p> +<ul> +<li>data schema is representative of real social networks</li> +<li>data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions</li> +<li>the software generator is easy-to-use, configurable and scalable</li> +</ul> +<p>SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:</p> +<ul> +<li><strong>Interactive</strong> +<ul> +<li>Tests system throughput with relatively simple queries and concurrent updates, it is designed to test ACID features and scalability in an online operational setting.</li> +<li>The targeted systems are expected to be those that offer transactional functionality.</li> +</ul> +</li> +<li><strong>Business Intelligence</strong> +<ul> +<li>Consists of complex structured queries for analyzing online behavior of users for marketing purposes, it is designed to stress query execution and optimization.</li> +<li>The targeted systems are expected to be those that offer an abstract query language.</li> +</ul> +</li> +<li><strong>Graph Analytics</strong> +<ul> +<li>Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.</li> +<li>Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not not expected to be transactional or need 
isolation.</li> +<li>The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.</li> +</ul> +</li> +</ul> +<h4 id="semantic-publishing-benchmark">Semantic Publishing Benchmark</h4> +<p>The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.</p> +<p>The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works &ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.</p> +<p>The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. 
Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.</p> +<p>Two separate workloads are modeled in SPB:</p> +<ul> +<li><strong>Editorial:</strong> Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.</li> +<li><strong>Aggregation:</strong> Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as &ldquo;dynamic&rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.</li> +</ul> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505026.pdf">Status of the Semantic Publishing Benchmark</a></p> + + + + + Second TUC Meeting + https://ldbcouncil.org/event/second-tuc-meeting/ + Mon, 22 Apr 2013 10:00:00 +0000 + + https://ldbcouncil.org/event/second-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.</p> +<p>This will be a two day event in Munich on the <strong>22/23rd April 2013</strong>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +<li><a href="#venue">Venue</a> +<ul> +<li><a href="#getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</a></li> +<li><a href="#getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</a></li> +<li><a href="#getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</a></li> +</ul> +</li> +<li><a href="#getting-there">Getting there</a></li> +<li><a href="#social-dinner">Social Dinner</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>April 22nd</strong></p> +<p>10:00 <em>Registration.</em><br> +10:30 Josep Lluis Larriba Pey (UPC) - <em>Welcome and Introduction.</em><br> +10:30 Peter Boncz (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687373.pptx">LDBC: goals and status</a></p> +<p><em>Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)</em></p> +<p>11:00 Josep Lluis Larriba Pey (UPC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687372.pdf">Social Network Benchmark Task Force</a><br> +11:30 Gustavo González (Mediapro): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687367.pdf">Graph-based User Modeling through Real-time Social Streams</a><br> +12:00 Klaus Großmann (Dshini): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687365.pdf">Neo4j at Dshini</a></p> +<p>12:30 Lunch</p> +<p><em>Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)</em></p> 
+<p>13:30 Barry Bishop (Ontotext): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687366.pptx">Semantic Publishing Benchmark Task Force</a><br> +14:00 Dave Rogers (BBC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687364.pptx">Linked Data Platform at the BBC</a><br> +14:30 Edward Thomas (Wolters Kluwer): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687374.pdf">Semantic Publishing at Wolters Kluwer</a></p> +<p>15:00 Coffee break</p> +<p><em>Projects Related to LDBC</em></p> +<p>15:30 Fabian Suchanek (MPI): &ldquo;YAGO: A large knowledge base from Wikipedia and WordNet&rdquo;<br> +16:00 Antonis Loziou (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687375.pptx">The OpenPHACTS approach to data integration</a><br> +16:30 Mirko Kämpf (Brox): &ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case&rdquo;</p> +<p>17:00 <em>End of first day</em></p> +<p>19:00 Social dinner</p> +<p><strong>April 23rd</strong></p> +<p><em>Industry &amp; Hardware Aspects</em></p> +<p>10:00 Xavier Lopez (Oracle): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687384.pdf">Graph Database Performance an Oracle Perspective.pdf</a><br> +10:30 Pedro Trancoso (University of Cyprus): &ldquo;Benchmarking and computer architecture: the research side&rdquo;</p> +<p>11:00 Coffee break</p> +<p><em>Future Steps and TUC feedback session</em></p> +<p>11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force<br> +12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force&quot;</p> +<p>12:30 <em>End of meeting</em></p> +<h3 id="logistics">Logistics</h3> +<h4 id="date">Date</h4> +<p>22nd and 23th April 2013</p> +<h4 
id="location">Location</h4> +<p>The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:</p> +<p>LRZ (Leibniz-Rechenzentrum)<br> +Boltzmannstraße 1<br> +85748 Garching, Germany</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi and Subway <a href="http://www.in.tum.de/fileadmin/user_upload/Sonstiges/anfahrt_garching.pdf">Ubahn</a></p> +<h5 id="getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</h5> +<p>Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.</p> +<h5 id="getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</h5> +<ol> +<li> +<p>(except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.</p> +</li> +<li> +<p>S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.</p> +</li> +<li> +<p>Taxi: fare is ca. 30-40 euros.</p> +</li> +</ol> +<p>For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. 
It will cover all public transportation for that day.</p> +<h5 id="getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</h5> +<p>The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.</p> +<p><strong>Finding LRZ@TUM</strong></p> +<p><a href="http://www.openstreetmap.org/?mlat=48.2615702464&amp;mlon=11.6686558264&amp;zoom=32">OpenStreetMap link</a></p> +<p><a href="https://maps.google.com/maps?q=48.2615702464,11.6686558264&amp;spn=0.005,0.005&amp;t=k">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687268.gif" alt=""></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687269.gif" alt=""></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying: Munich</strong> airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.</p> +<p><strong>S-Bahn:</strong> S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the tickets needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.</p> +<p><strong>Taxi:</strong> taxi from the airport to the city center costs approximately 50 euros</p> +<h4 id="social-dinner">Social Dinner</h4> +<p>The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)</p> +<p>Address: Hofbräuhaus, Platzl 9, Munich</p> + + + + + First TUC Meeting + https://ldbcouncil.org/event/first-tuc-meeting/ + Mon, 19 Nov 2012 09:00:00 +0100 + + https://ldbcouncil.org/event/first-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the <strong>19/20th November 2012</strong>.</p> +<p>So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event will include:</p> +<ul> +<li>Introduction by the coordinator and technical director explaining the objectives of the LDBC project</li> +<li>Invitation to users to explain their use-cases and describe the limitations they have found in current technology</li> +<li>Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points</li> +</ul> +<p>The exact agenda will be published here as things get finalised before the event.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#slide">Slide</a> +<ul> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +</ul> +</li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>We will start at 9:00 on Monday for a full day, followed by a half a day on Tuesday to allow attendees to travel home on the evening of the 20th.</p> +<p><strong>Day 1</strong></p> +<p>09:00 Welcome (Location: Aula Master)<br> +09:30 Project overview (Emphasis on task forces?) + Questionnaire results?<br> +10:30 Coffee break<br> +11:00 User talks (To gather information for use cases?)</p> +<p>13:00 Lunch</p> +<p>14:00 User talks (cont.)<br> +15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).<br> +16:00 Task force proposals (consortium)<br> +17:00 Finish first day</p> +<p>20:00 Social dinner</p> +<p><strong>Day 2</strong></p> +<p>10:00 Task force discussion (consortium + TUC)<br> +11:00 Coffe break<br> +11:30 Task force discussion (consortium + TUC)<br> +12:30 Summaries (Task forces, use cases, &hellip;) and actions</p> +<p>13:00 Lunch and farewell</p> +<p>15:00 LDBC Internal meeting</p> +<h3 id="slide">Slide</h3> +<p>Opening session:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686995.pptx">CWI – Peter Boncz</a> – Objectives</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687001.pdf">UPC – Larri</a> – Questionnaire</li> +</ul> +<p>User stories:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686998.pdf">BBC – Jem Rayfield</a></li> +<li>CA Technologies – Victor Muntés</li> 
+<li>Connected Discovery (Open Phacts) – Bryn Williams-Jones</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687003.pptx">Elsevier – Alan Yagoda</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687000.pptx">ERA7 Bioinformatics – Eduardo Pareja</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687005.pptx">Press Association – Jarred McGinnis</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687004.pptx">RJLee – David Neuer</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686994.pdf">Yale – Lec Maj</a></li> +</ul> +<p>Benchmark proposals:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686991.pdf">Publishing benchmark proposal – Ontotext – Barry Bishop</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687002.pdf">Social Network Benchmark Proposal – UPC – Larri</a></li> +</ul> +<h4 id="logistics">Logistics</h4> +<h5 id="date">Date</h5> +<p>19th and 20th November 2012</p> +<h5 id="location">Location</h5> +<p>The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<p><strong>Finding UPC</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<p><strong>Finding the meeting room</strong></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<p><strong>The locations of the airport and the city centre</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933317.jpg" alt=""></p> +<p><strong>Bus map</strong></p> + + + + + \ No newline at end of file diff --git a/gql-community/lex/index.html b/gql-community/lex/index.html new file mode 100644 index 00000000..e6ff6417 --- /dev/null +++ b/gql-community/lex/index.html @@ -0,0 +1,374 @@ + + + + + LDBC Extended GQL Schema (LEX) working group + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Extended GQL Schema (LEX) working group

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

Group leader: Alastair Green (JCC)

+

Active members:

+
    +
  • Koji Annoura
  • +
  • Michael Behrisch
  • +
  • Stephen Cannan
  • +
  • Alin Deutsch
  • +
  • George Fletcher
  • +
  • Thomas Friesendal
  • +
  • Denise Gosnell
  • +
  • Alastair Green
  • +
  • Cole Greer
  • +
  • Zhihui Guo
  • +
  • Keith Hare
  • +
  • Jan Hidders
  • +
  • Longbin Lai
  • +
  • Heng Lin
  • +
  • Alessandro Mosca
  • +
  • Stefan Plantikow
  • +
  • Yuya Sasaki
  • +
  • Ognjen Savkovic
  • +
  • Michael Schmidt
  • +
  • Dominik Tomaszuk
  • +
  • Yang Xia
  • +
  • Wenyuan Yu
  • +
  • Tao Wang
  • +
  • Dušan Živkovic
  • +
  • and 15+ observers
  • +
+

See the LEX work charter, which details the group’s mission, motivation, and scope of work.

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/overview/index.html b/gql-community/overview/index.html new file mode 100644 index 00000000..9d39b9cc --- /dev/null +++ b/gql-community/overview/index.html @@ -0,0 +1,355 @@ + + + + + LDBC Graph Query Working Groups + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

LDBC Graph Query Working Groups

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

LDBC’s working groups investigate research questions on graph query languages and language extensions for graphs including ISO/IEC SQL/PGQ, released in June 2023, and GQL, scheduled to be released in March 2024.

+

Active Working Groups

+ +

Historical Working Groups

+ + +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/1/index.html b/gql-community/page/1/index.html new file mode 100644 index 00000000..7780d3ca --- /dev/null +++ b/gql-community/page/1/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/gql-community/ + + + + + + diff --git a/gql-community/page/2/index.html b/gql-community/page/2/index.html new file mode 100644 index 00000000..ee463c32 --- /dev/null +++ b/gql-community/page/2/index.html @@ -0,0 +1,804 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + + + + +
+
+
+ +

LDBC and Apache Flink

+
Tags:
+ FLINK + , DATAGEN + , SNB + +
+
+ +

Apache Flink [1] is an open source platform for distributed stream and batch data processing. Flink’s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.

+

+

Flink offers multiple APIs to process data …

+ +
+
+ +
+ + +
+
+
+ +

Elements of Instance Matching Benchmarks: a Short Overview

+
Tags:
+ INSTANCE MATCHING + , SPB + +
+
+ +

The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. In the vast number of data sources participating in the Linked Data Cloud, this information is not …

+ +
+
+ +
+ + +
+
+ +
+ +

In this post we will look at running the LDBC SNB on Virtuoso.

+

First, let’s recap what the benchmark is about:

+
    +
  1. +

    fairly frequent short updates, with no update contention worth mentioning

    +
  2. +
  3. +

    short random lookups

    +
  4. +
  5. +

    medium complex queries centered around a person’s social environment

    +
  6. +
+

The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an …

+ +
+
+ +
+ + +
+
+
+ +

SNB and Graphs Related Presentations at GRADES '15

+
Tags:
+ SIGMOD + , GRAPHALYTICS + , GRADES + , SNB + , DATAGEN + , WORKSHOP + +
+
+ +

On the 31st of May, the GRADES workshop will take place in Melbourne as part of the ACM SIGMOD/PODS conference. GRADES started as an initiative of the Linked Data Benchmark Council at SIGMOD/PODS 2013, held in New York.

+

Among the papers published in this edition we have “Graphalytics: A Big Data Benchmark for Graph-Processing Platforms”, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 2: Modeling Choices

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

​SNB Interactive is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.

+

In the case of Virtuoso, we have played with SQL and SPARQL implementations. For a fixed schema and well known workload, SQL will always win. The reason for this is that …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/3/index.html b/gql-community/page/3/index.html new file mode 100644 index 00000000..f5a2a47e --- /dev/null +++ b/gql-community/page/3/index.html @@ -0,0 +1,791 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference

+
Tags:
+ SIGMOD + , GRADES + , SNB + , GRAPHALYTICS + , WORKSHOP + +
+
+ +

LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.

+

On the industry track, LDBC will be presenting the Social Network Benchmark Interactive …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 1: What Is SNB Interactive Really About?

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.

+

With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its …

+ +
+
+ +
+ + +
+
+
+ +

Why Do We Need an LDBC SNB-Specific Workload Driver?

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

In a previous 3-part blog series we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. What we didn’t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more …

+ +
+
+ +
+ + +
+
+
+ +

Event Driven Post Generation in Datagen

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.

+

First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), …

+ +
+
+ +
+ + +
+
+
+ +

The LDBC Datagen Community Structure

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.

+

When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution and the clustering coefficient. Real graphs, and …

+ +
+
+ +
+ + +
+
+
+ +

Industry Relevance of the Semantic Publishing Benchmark

+
Tags:
+ INDUSTRY + , SPB + +
+
+ + + post/industry-relevance-of-the-semantic-publishing-benchmark/01_sf_newspapers.png +
+ +
+ +

Publishing and media businesses are going through transformation

+

I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/4/index.html b/gql-community/page/4/index.html new file mode 100644 index 00000000..9fd09999 --- /dev/null +++ b/gql-community/page/4/index.html @@ -0,0 +1,757 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

OWL-Empowered SPARQL Query Optimization

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution …

+ +
+
+ +
+ + +
+
+
+ +

Person Activity Subgraph Features in LDBC DATAGEN

+
Tags:
+ SNB + , DATAGEN + +
+
+ +

When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other things worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 2: Tracking Dependencies Between Queries

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

The SNB Driver part 1 post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we’ll drill down deeper into the details of what it means to execute “dependent queries” during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 3: Workload Execution Putting It All Together

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

Up until now we have introduced the challenges faced when executing the LDBC SNB benchmark, as well as explained how some of these are overcome. With the foundations laid, we can now explain precisely how operations are executed.

+

Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these …

+ +
+
+ +
+ + +
+
+
+ +

Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide

+
Tags:
+ SPB + , SESAME + , RDF + , TUTORIAL + , GUIDE + +
+
+ +

Until now we have discussed several aspects of the Semantic Publishing Benchmark (SPB) such as the difference in performance between virtual and real servers configuration, how to choose an appropriate query mix for a benchmark run and our experience with using SPB in the development process of GraphDB for finding performance issues.

+

In this post we provide a step-by-step guide on how to run SPB using the Sesame RDF data store on a fresh install …

+ +
+
+ +
+ + +
+
+
+ +

Semantic Publishing Instance Matching Benchmark

+
Tags:
+ INSTANCE MATCHING + , BENCHMARK + +
+
+ +

The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the-art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.

+

The SPIMBench …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/5/index.html b/gql-community/page/5/index.html new file mode 100644 index 00000000..4dcf2999 --- /dev/null +++ b/gql-community/page/5/index.html @@ -0,0 +1,757 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Further Developments in SNB BI Workload

+
Tags:
+ SNB + , BI + +
+
+ +

We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and myself are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.

+

As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.

+

There are obvious marketing applications for a SNB-like dataset. There are also security …

+ +
+
+ +
+ + +
+
+
+ +

Sizing AWS Instances for the Semantic Publishing Benchmark

+
Tags:
+ SPB + , AMAZON + , EC2 + , AWS + , RDF + +
+
+ +

LDBC’s Semantic Publishing Benchmark (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous BBC Dynamic Semantic Publishing scenario. Such a load combines tens of updates per second (e.g. adding metadata about new articles) with an even higher volume of read requests (SPARQL queries collecting recent content and data to generate a web page on a specific subject, e.g. Frank …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: a Realistic Social Network Data Generator

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

In previous posts (Getting started with snb, DATAGEN: data generation for the Social Network Benchmark), Arnau Prat discussed the main features and characteristics of DATAGEN: realism, scalability, determinism, usability. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 1

+
Tags:
+ SNB + , DRIVER + , TPC-C + , INTERACTIVE + +
+
+ +

In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: …

+ +
+
+ +
+ + +
+
+
+ +

Making Semantic Publishing Execution Rules

+
Tags:
+ SPB + , TEST RUN + +
+
+ +

LDBC SPB (Semantic Publishing Benchmark) is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC’s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an Ontotext Graph DB deployment. Graph DB was formerly known as Owlim.

+

So, in SPB we wanted to address substantially more complex queries than the lookups that …

+ +
+
+ +
+ + +
+
+ +
+ +

The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/6/index.html b/gql-community/page/6/index.html new file mode 100644 index 00000000..ccb4a516 --- /dev/null +++ b/gql-community/page/6/index.html @@ -0,0 +1,772 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Choke Point Based Benchmark Design

+
Tags:
+ DATABASE + , BENCHMARK + , DESIGN + +
+
+ +

The Linked Data Benchmark Council (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and arbitrate around the official benchmark results. The council and its https://ldbcouncil.org website just launched, and in its first 1.5 years of existence, most effort at LDBC has gone into investigating the needs of the field through interaction …

+ +
+
+ +
+ + +
+
+
+ +

New Website Online LDBC Benchmarks Reach Public Draft

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing for 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In the case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be …

+ +
+
+ +
+ + +
+
+
+ +

Social Network Benchmark Goals

+
Tags:
+ SNB + , DATAGEN + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ +

Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established …

+ +
+
+ +
+ + +
+
+ +
+ +

It is with great pleasure that we announce the new LDBC organisation site at www.ldbcouncil.org. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its …

+ +
+
+ +
+ + +
+
+
+ +

2nd International Workshop on Benchmarking RDF Systems

+
Tags:
+ WORKSHOP + , CFP + , BENCHMARK + , BERSYS + +
+
+ +

Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industry practitioners can meet to discuss topics related to the performance of RDF systems. BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops. The focus of the workshop is to expose and initiate …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: Data Generation for the Social Network Benchmark

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 [1].

+

One of the most …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/7/index.html b/gql-community/page/7/index.html new file mode 100644 index 00000000..2d9e4a8e --- /dev/null +++ b/gql-community/page/7/index.html @@ -0,0 +1,787 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Getting Started With SNB

+
Tags:
+ SNB + , INTERACTIVE + , DATAGEN + +
+
+ +

In a previous blog post titled “Is SNB like Facebook’s LinkBench?”, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.

+

DATAGEN

+

DATAGEN is the data generator used by all the workloads of SNB. Here we introduced the …

+ +
+
+ +
+ + +
+
+ +
+ +

The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. This post introduces the interactive workload.

+

The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user’s social environment and potentially access data associated with the friends of a user and their friends.

+

This …

+ +
+
+ +
+ + +
+
+
+ +

Is SNB Like Facebooks LinkBench

+
Tags:
+ DEVELOPER + , SNB + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ + + post/is-snb-like-facebooks-linkbench/SNB-workloads-vs-systems.jpg +
+ +
+ +

In this post, I will discuss in some detail the rationale and goals of the design of the Social Network Benchmark (SNB) and explain how it relates to real …

+ +
+
+ +
+ + +
+
+
+ +

Making It Interactive

+
Tags:
+ SNB + , BENCHMARKING + , TPC + , SPARQL + , INTERACTIVE + +
+
+ +

Synopsis: Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.

+

It is about to be showtime for LDBC. The initial installment of the …

+ +
+
+ +
+ + +
+
+
+ +

SNB Data Generator - Getting Started

+
Tags:
+ DATAGEN + , SNB + , SOCIAL NETWORK + +
+
+ +

In previous posts (this and this) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.

+

Getting and Configuring Hadoop

+

DATAGEN runs on top of Hadoop 1.2.1 to be scalable. …

+ +
+
+ +
+ + +
+
+
+ +

The Day of Graph Analytics

+
Tags:
+ ANALYTICS + , SNB + +
+
+ +

Note: consider this post as a continuation of the “Making it interactive” post by Orri Erling.

+

I have now completed the Virtuoso TPC-H work, including scale out. Optimization possibilities extend to infinity but the present level is good enough. TPC-H is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/page/8/index.html b/gql-community/page/8/index.html new file mode 100644 index 00000000..69a8385e --- /dev/null +++ b/gql-community/page/8/index.html @@ -0,0 +1,658 @@ + + + + + Gql communities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Gql-communities

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Using LDBC SPB to Find OWLIM Performance Issues

+
Tags:
+ LDBC + , SPB + , RDF + +
+
+ +

During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC Semantic Publishing Benchmark (LDBC-SPB) as a part of our development and release process.

+

The first thing we started using LDBC-SPB for is monitoring the performance of our RDF store when a new release is about to come out.

+

Initially, we decided to fix some of the benchmark parameters:

+
    +
  • the dataset size - 50 million triples (LDBC-SPB50) * benchmark warmup …
+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/gql-community/pgswg/index.html b/gql-community/pgswg/index.html new file mode 100644 index 00000000..533538b5 --- /dev/null +++ b/gql-community/pgswg/index.html @@ -0,0 +1,495 @@ + + + + + Property Graph Schema Working Group (PGSWG) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Property Graph Schema Working Group (PGSWG)

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+

Group leaders: Jan Hidders (Birkbeck College, University of London), Juan Sequeda (data.world)

+

The PGSWG has 4 sub-groups: PG-Basic, PG-Constraints, PG-Properties, PG-Nulls

+

PG-Basic

+
    +
  • Group leader: Jan Hidders (Birkbeck College, University of London)
  • +
  • Focus: Basic constructs and semantics
  • +
+

Group members

+
    +
  • Alastair Green (JCC Consulting; Birkbeck College, University of London)
  • +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Bei Li (Google)
  • +
  • Dominik Tomaszuk (University of Bialystok)
  • +
  • Enrico Franconi (Free University of Bozen-Bolzano)
  • +
  • George Fletcher (Eindhoven TU)
  • +
  • Gilles Privat (Orange S.A.)
  • +
  • Hannes Voigt (Neo4j)
  • +
  • Harsh Thakkar (Consultant OSTHUS GmbH)
  • +
  • Jan Hidders (Birkbeck College, University of London)
  • +
  • Jason Crawford (Amazon)
  • +
  • Josh Perryman (VeracityID)
  • +
  • Joshua Shinavier (LinkedIn)
  • +
  • Juan Sequeda (data.world)
  • +
  • Keith W. Hare (JCC Consulting)
  • +
  • Koji Annoura (UTI, Inc.)
  • +
  • Leonid Libkin (ENS Paris, University of Edinburgh)
  • +
  • Liat Peterfreund (ENS Paris)
  • +
  • Michael Schmidt (Amazon Web Services)
  • +
  • Renzo Angles (Universidad de Talca)
  • +
  • Slawek Staworko (Université de Lille)
  • +
  • Stefania Dumbrava (Ecole Nationale Supérieure d’Informatique pour l’Industrie et l’Entreprise (ENSIIE))
  • +
  • Victor Lee (TigerGraph)
  • +
  • Victor Marsault (CNRS)
  • +
  • Wim Martens (University of Bayreuth)
  • +
  • Wook-Shin Han (POSTECH)
  • +
+

PG-Constraints

+
    +
  • Group leader: George Fletcher (TU Eindhoven)
  • +
  • Focus: Key constraints and cardinality constraints
  • +
+

Group members

+
    +
  • Alastair Green (JCC Consulting; Birkbeck College, University of London)
  • +
  • Andrea Cali (Birkbeck College, University of London)
  • +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Bei Li (Google)
  • +
  • Borislav Iordanov (Kobrix)
  • +
  • Dominik Tomaszuk (University of Bialystok)
  • +
  • Enrico Franconi (Free University of Bozen-Bolzano)
  • +
  • Filip Murlak (University of Warsaw)
  • +
  • George Fletcher (Eindhoven TU)
  • +
  • Jan Hidders (Birkbeck College, University of London)
  • +
  • Jason Crawford (Amazon)
  • +
  • Josh Perryman (VeracityID)
  • +
  • Juan Sequeda (data.world)
  • +
  • Keith W. Hare (JCC Consulting)
  • +
  • Koji Annoura (UTI, Inc.)
  • +
  • Leonid Libkin (ENS Paris, University of Edinburgh)
  • +
  • Michael Schmidt (Amazon Web Services)
  • +
  • Slawek Staworko (Université de Lille)
  • +
  • Stefania Dumbrava (Ecole Nationale Supérieure d’Informatique pour l’Industrie et l’Entreprise (ENSIIE))
  • +
  • Wim Martens (University of Bayreuth)
  • +
  • Wook-Shin Han (POSTECH)
  • +
+

PG-Properties

+
    +
  • Group leader: Joshua Shinavier (LinkedIn)
  • +
  • Focus: Data types for properties
  • +
+

Group members

+
    +
  • Alastair Green (JCC Consulting; Birkbeck College, University of London)
  • +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Bei Li (Google)
  • +
  • Borislav Iordanov (Kobrix)
  • +
  • Dominik Tomaszuk (University of Bialystok)
  • +
  • Enrico Franconi (Free University of Bozen-Bolzano)
  • +
  • Filip Murlak (University of Warsaw)
  • +
  • George Fletcher (Eindhoven TU)
  • +
  • Gilles Privat (Orange S.A.)
  • +
  • Harsh Thakkar (Consultant OSTHUS GmbH)
  • +
  • Jan Hidders (Birkbeck College, University of London)
  • +
  • Jason Crawford (Amazon)
  • +
  • Josh Perryman (VeracityID)
  • +
  • Joshua Shinavier (LinkedIn)
  • +
  • Juan Sequeda (data.world)
  • +
  • Keith W. Hare (JCC Consulting)
  • +
  • Koji Annoura (UTI, Inc.)
  • +
  • Michael Schmidt (Amazon Web Services)
  • +
  • Renzo Angles (Universidad de Talca)
  • +
  • Stefania Dumbrava (Ecole Nationale Supérieure d’Informatique pour l’Industrie et l’Entreprise (ENSIIE))
  • +
  • Victor Lee (TigerGraph)
  • +
  • Victor Marsault (CNRS)
  • +
  • Wim Martens (University of Bayreuth)
  • +
  • Wook-Shin Han (POSTECH)
  • +
+

PG-Nulls

+
    +
  • Group leader: Angela Bonifati (Lyon 1 University)
  • +
  • Focus: Null values
  • +
+

Group members

+
    +
  • Alastair Green (JCC Consulting; Birkbeck College, University of London)
  • +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Dominik Tomaszuk (University of Bialystok)
  • +
  • Enrico Franconi (Free University of Bozen-Bolzano)
  • +
  • Filip Murlak (University of Warsaw)
  • +
  • Gilles Privat (Orange S.A.)
  • +
  • Jan Hidders (Birkbeck College, University of London)
  • +
  • Joshua Shinavier (LinkedIn)
  • +
  • Juan Sequeda (data.world)
  • +
  • Koji Annoura (UTI, Inc.)
  • +
  • Leonid Libkin (ENS Paris, University of Edinburgh)
  • +
  • Liat Peterfreund (ENS Paris)
  • +
  • Michael Schmidt (Amazon Web Services)
  • +
  • Paolo Guagliardo (University of Edinburgh)
  • +
  • Slawek Staworko (Université de Lille)
  • +
  • Stefania Dumbrava (Ecole Nationale Supérieure d’Informatique pour l’Industrie et l’Entreprise (ENSIIE))
  • +
  • Victor Lee (TigerGraph)
  • +
  • Wim Martens (University of Bayreuth)
  • +
  • Wook-Shin Han (POSTECH)
  • +
+

Threshold queries

+
    +
  • Angela Bonifati (Lyon 1 University)
  • +
  • Dominik Tomaszuk (University of Bialystok)
  • +
  • Filip Murlak (University of Warsaw)
  • +
  • George Fletcher (Eindhoven TU)
  • +
  • Jan Hidders (Birkbeck College, University of London)
  • +
  • Joshua Shinavier (LinkedIn)
  • +
  • Matthias Hofer (University of Bayreuth)
  • +
  • Slawek Staworko (Université de Lille)
  • +
  • Stefania Dumbrava (Ecole Nationale Supérieure d’Informatique pour l’Industrie et l’Entreprise (ENSIIE))
  • +
  • Wim Martens (University of Bayreuth)
  • +
+ +
+
+ +
+
+
+
+
+
+
+
+

Latest Working Group Updates

+
+
+

+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/images/cube_1.png b/images/cube_1.png new file mode 100644 index 00000000..4aa4affc Binary files /dev/null and b/images/cube_1.png differ diff --git a/images/cube_2.png b/images/cube_2.png new file mode 100644 index 00000000..24c0f989 Binary files /dev/null and b/images/cube_2.png differ diff --git a/images/cube_full.png b/images/cube_full.png new file mode 100644 index 00000000..4494f35b Binary files /dev/null and b/images/cube_full.png differ diff --git a/images/favicon.ico b/images/favicon.ico new file mode 100644 index 00000000..30ec30d8 Binary files /dev/null and b/images/favicon.ico differ diff --git a/images/favicon.png b/images/favicon.png new file mode 100644 index 00000000..3df75eb3 Binary files /dev/null and b/images/favicon.png differ diff --git a/images/ldbc-title.png b/images/ldbc-title.png new file mode 100644 index 00000000..a09f58fb Binary files /dev/null and b/images/ldbc-title.png differ diff --git a/images/ldbc.png b/images/ldbc.png new file mode 100644 index 00000000..3df75eb3 Binary files /dev/null and b/images/ldbc.png differ diff --git a/images/members/alibaba-damo-academy.png b/images/members/alibaba-damo-academy.png new file mode 100644 index 00000000..3cb019aa Binary files /dev/null and b/images/members/alibaba-damo-academy.png differ diff --git a/images/members/antgroup.png b/images/members/antgroup.png new file mode 100644 index 00000000..530d1e31 Binary files /dev/null and b/images/members/antgroup.png differ diff --git a/images/members/arangodb.png b/images/members/arangodb.png new file mode 100644 index 00000000..2bf688a9 Binary files /dev/null and b/images/members/arangodb.png differ diff --git a/images/members/aws.png b/images/members/aws.png new file mode 100644 index 00000000..50bd7486 Binary files /dev/null and b/images/members/aws.png differ diff --git a/images/members/birkbeck.jpg 
b/images/members/birkbeck.jpg new file mode 100644 index 00000000..944188d0 Binary files /dev/null and b/images/members/birkbeck.jpg differ diff --git a/images/members/createlink.png b/images/members/createlink.png new file mode 100644 index 00000000..c0b31a29 Binary files /dev/null and b/images/members/createlink.png differ diff --git a/images/members/ens-paris.png b/images/members/ens-paris.png new file mode 100644 index 00000000..70ab294f Binary files /dev/null and b/images/members/ens-paris.png differ diff --git a/images/members/fabarta.jpg b/images/members/fabarta.jpg new file mode 100644 index 00000000..9fe574a4 Binary files /dev/null and b/images/members/fabarta.jpg differ diff --git a/images/members/forth.png b/images/members/forth.png new file mode 100644 index 00000000..ef3b6ac3 Binary files /dev/null and b/images/members/forth.png differ diff --git a/images/members/intel.png b/images/members/intel.png new file mode 100644 index 00000000..a53621a4 Binary files /dev/null and b/images/members/intel.png differ diff --git a/images/members/jcc.png b/images/members/jcc.png new file mode 100644 index 00000000..ad2acd0c Binary files /dev/null and b/images/members/jcc.png differ diff --git a/images/members/katana-graph.jpg b/images/members/katana-graph.jpg new file mode 100644 index 00000000..9e33f038 Binary files /dev/null and b/images/members/katana-graph.jpg differ diff --git a/images/members/memgraph.png b/images/members/memgraph.png new file mode 100644 index 00000000..275528ca Binary files /dev/null and b/images/members/memgraph.png differ diff --git a/images/members/nebulagraph.png b/images/members/nebulagraph.png new file mode 100644 index 00000000..34fcb91d Binary files /dev/null and b/images/members/nebulagraph.png differ diff --git a/images/members/neo4j.png b/images/members/neo4j.png new file mode 100644 index 00000000..c4c75320 Binary files /dev/null and b/images/members/neo4j.png differ diff --git a/images/members/ontotext.jpg 
b/images/members/ontotext.jpg new file mode 100644 index 00000000..1692fb75 Binary files /dev/null and b/images/members/ontotext.jpg differ diff --git a/images/members/oracle-labs.png b/images/members/oracle-labs.png new file mode 100644 index 00000000..86fe547e Binary files /dev/null and b/images/members/oracle-labs.png differ diff --git a/images/members/pometry.png b/images/members/pometry.png new file mode 100644 index 00000000..53333010 Binary files /dev/null and b/images/members/pometry.png differ diff --git a/images/members/redis-labs.png b/images/members/redis-labs.png new file mode 100644 index 00000000..f86a9d3b Binary files /dev/null and b/images/members/redis-labs.png differ diff --git a/images/members/relationalai.png b/images/members/relationalai.png new file mode 100644 index 00000000..7cdbe348 Binary files /dev/null and b/images/members/relationalai.png differ diff --git a/images/members/sparsity.png b/images/members/sparsity.png new file mode 100644 index 00000000..a2926454 Binary files /dev/null and b/images/members/sparsity.png differ diff --git a/images/members/stargraph.png b/images/members/stargraph.png new file mode 100644 index 00000000..75564389 Binary files /dev/null and b/images/members/stargraph.png differ diff --git a/images/members/tigergraph.png b/images/members/tigergraph.png new file mode 100644 index 00000000..fc8e0984 Binary files /dev/null and b/images/members/tigergraph.png differ diff --git a/images/members/ultipa.png b/images/members/ultipa.png new file mode 100644 index 00000000..61f8afc8 Binary files /dev/null and b/images/members/ultipa.png differ diff --git a/images/members/volcengine.png b/images/members/volcengine.png new file mode 100644 index 00000000..3d2c9371 Binary files /dev/null and b/images/members/volcengine.png differ diff --git a/images/members/zhejiang-lab.png b/images/members/zhejiang-lab.png new file mode 100644 index 00000000..b00c0def Binary files /dev/null and b/images/members/zhejiang-lab.png differ diff 
--git a/images/slide-image-1.jpg b/images/slide-image-1.jpg new file mode 100644 index 00000000..17ff8d92 Binary files /dev/null and b/images/slide-image-1.jpg differ diff --git a/images/slide-image-2.jpg b/images/slide-image-2.jpg new file mode 100644 index 00000000..af1cac9f Binary files /dev/null and b/images/slide-image-2.jpg differ diff --git a/images/slide-image-3.jpg b/images/slide-image-3.jpg new file mode 100644 index 00000000..970ebbe7 Binary files /dev/null and b/images/slide-image-3.jpg differ diff --git a/images/slide-image-4.jpg b/images/slide-image-4.jpg new file mode 100644 index 00000000..880ea210 Binary files /dev/null and b/images/slide-image-4.jpg differ diff --git a/images/steering-committee-pictures/alastair-green.jpg b/images/steering-committee-pictures/alastair-green.jpg new file mode 100644 index 00000000..4a4fa88f Binary files /dev/null and b/images/steering-committee-pictures/alastair-green.jpg differ diff --git a/images/steering-committee-pictures/gabor-szarnyas.jpg b/images/steering-committee-pictures/gabor-szarnyas.jpg new file mode 100644 index 00000000..a47d1d3e Binary files /dev/null and b/images/steering-committee-pictures/gabor-szarnyas.jpg differ diff --git a/images/steering-committee-pictures/judy-bingham.jpg b/images/steering-committee-pictures/judy-bingham.jpg new file mode 100644 index 00000000..d3acc221 Binary files /dev/null and b/images/steering-committee-pictures/judy-bingham.jpg differ diff --git a/images/steering-committee-pictures/oskar-van-rest.jpg b/images/steering-committee-pictures/oskar-van-rest.jpg new file mode 100644 index 00000000..a2a58161 Binary files /dev/null and b/images/steering-committee-pictures/oskar-van-rest.jpg differ diff --git a/images/steering-committee-pictures/peter-boncz.jpg b/images/steering-committee-pictures/peter-boncz.jpg new file mode 100644 index 00000000..49b1bb77 Binary files /dev/null and b/images/steering-committee-pictures/peter-boncz.jpg differ diff --git 
a/images/steering-committee-pictures/shipeng-qi.jpg b/images/steering-committee-pictures/shipeng-qi.jpg new file mode 100644 index 00000000..e41ea82f Binary files /dev/null and b/images/steering-committee-pictures/shipeng-qi.jpg differ diff --git a/index.html b/index.html new file mode 100644 index 00000000..ac5e4d31 --- /dev/null +++ b/index.html @@ -0,0 +1,487 @@ + + + + + Linked Data Benchmark Council + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + + + + + +
+
+ + +
+
+
+
+

Welcome to the LDBC website

+

On this website, you can find information on benchmark specifications and audited results, on how the LDBC organization works, and benchmarking resources.

+

+ + Read More + +
+
+
+
+ + +
+
+
+
+

Community engagement

+

LDBC brings industry and users together for developing benchmarks whereby the state-of-the-art and advances in graph database technologies can be assessed and directed.

+

+ + Read More + +
+
+
+
+ + +
+
+
+
+

Benchmarks

+

LDBC’s benchmarks include the Social Network Benchmark suite, the Semantic Publishing Benchmark, and Graphalytics.

+

+ + Read More + +
+
+
+
+ + +
+
+
+
+

Industry

+

On these pages, we provide information on the LDBC organization and its members, including information on how to become a member.

+

+ + Read More + +
+
+
+
+ +
+
+ + + + + + + + + + + + + +
+
+
+
+
+

Latest Updates

+
+
+

+ + +
+
+ +

Announcing the Official Release of LDBC Financial Benchmark v0.1.0

+
+ 27 Jun 2023 +

We are delighted to announce the official release of the initial version (v0.1.0) of the Financial Benchmark (FinBench).

+

The Financial Benchmark (FinBench) project defines a graph database benchmark targeting …

+
+ + +
+ +
+
+ +

Sixteenth TUC Meeting

+
+ 14 Feb 2023 +

Organizers: Oskar van Rest, Alastair Green, Gábor Szárnyas

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2023 on June 23-24 (Friday-Saturday).

+

The program consists of 10- and 15-minute talks …

+
+ + +
+ +
+
+ +

LDBC SNB – Early 2023 updates

+
+ 15 Feb 2023 +

2023 has been an eventful year for us so far. Here is a summary of our recent activities.

+
    +
  1. +

    Our paper The LDBC Social Network Benchmark: Business Intelligence Workload was published in PVLDB.

    +
  2. +
  3. +

    David Püroja …

+
+ + +
+ +
+
+ +

LDBC SNB Datagen – The winding path to SF100K

+
+ 13 Sep 2022 +

LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, …

+
+ + +
+ +
+
+ +

Fifteenth TUC Meeting

+
+ 05 Apr 2022 +

Organizers: Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green

+

LDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2022 on June 17-18 (Friday-Saturday).

+

The program consists of 10-15 minute …

+
+ + +
+ +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/index.json b/index.json new file mode 100644 index 00000000..16000ba1 --- /dev/null +++ b/index.json @@ -0,0 +1 @@ +[{"categories":null,"contents":"We are delighted to announce the official release of the initial version (v0.1.0) of Financial Benchmark (FinBench).\nThe Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the LDBC FinBench Task Force. The benchmark has one workload currently, Transaction Workload, capturing OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.\nCompared to LDBC SNB, the FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the slides in the 16th TUC. The Financial Benchmark\u0026rsquo;s specification can be found on arXiv.\nThe release of FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2022. It is the good beginning of FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.\nIf you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.\n","permalink":"https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/","tags":["finbench"],"title":"Announcing the Official Release of LDBC Financial Benchmark v0.1.0"},{"categories":null,"contents":"Organizers: Oskar van Rest, Alastair Green, Gábor Szárnyas\nLDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2023 on June 23-24 (Friday-Saturday).\nThe program consists of 10- and 15-minute talks followed by a Q\u0026amp;A session. 
The talks will be recorded and made available online. If you would like to participate please register using our form.\nLDBC will host a social event on Friday at the Black Bottle gastrotavern in Belltown: 2600 1st Ave (on the corner of Vine), Seattle, WA 98121.\nIn addition, AWS will host a Happy Hour (rooftop grill with beverages) on Saturday on the Amazon Nitro South building\u0026rsquo;s 8th floor deck: 2205 8th Ave, Seattle, WA 98121.\nProgram All times are in PDT.\nFriday Location: Hyatt Regency Bellevue on Seattle\u0026rsquo;s Eastside, room Grand K, co-located with SIGMOD (900 Bellevue Way NE, Bellevue, WA 98004-4272)\nstart finish speaker title 08:30 08:45 Oskar van Rest (Oracle) LDBC – State of the union – slides, video 08:50 09:05 Keith Hare (JCC / WG3) An update on the GQL \u0026amp; SQL/PGQ standards efforts – slides, video 09:10 09:25 Stefan Plantikow (Neo4j / WG3) GQL - Introduction to a new query language standard – slides 09:30 09:45 Leonid Libkin (University of Edinburgh \u0026amp; RelationalAI) Formalizing GQL – slides, video 09:50 10:05 Semen Panenkov (JetBrains Research) Mechanizing the GQL semantics in Coq – slides, videos 10:10 10:25 Oskar van Rest (Oracle) SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – slides, video 10:30 11:00 coffee break 11:00 11:15 Alastair Green (JCC) LDBC\u0026rsquo;s organizational changes and fair use policies – slides 11:20 11:35 Ioana Manolescu (INRIA) Integrating Connection Search in Graph Queries – slides, video 11:40 11:55 Maciej Besta (ETH Zurich) Neural Graph Databases with Graph Neural Networks – video 12:00 12:10 Longbin Lai (Alibaba Damo Academy) To Revisit Benchmarking Graph Analytics – slides, video 12:15 13:30 lunch 13:30 13:45 Yuanyuan Tian (Gray Systems Lab, Microsoft) The World of Graph Databases from An Industry Perspective – slides, video 13:50 14:05 Alin Deutsch (UC San Diego \u0026amp; TigerGraph) TigerGraph\u0026rsquo;s Parallel Computation Model – slides, video 14:10 
14:25 Chen Zhang (CreateLink) Applications of a Native Distributed Graph Database in the Financial Industry – video 14:30 14:45 Ricky Sun (Ultipa) Design of highly scalable graph database systems – slides, video 14:50 15:30 coffee break 15:30 15:45 Heng Lin (Ant Group) The LDBC SNB implementation in TuGraph – slides, video 15:50 16:05 Shipeng Qi (Ant Group) FinBench: The new LDBC benchmark targeting financial scenario – slides, video 16:10 17:00 host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft) FinBench panel – slides 19:00 22:00 dinner Black Bottle gastrotavern in Belltown: 2600 1st Ave (on the corner of Vine), Seattle, WA 98121 Saturday Location: Amazon Nitro South building, room 03.204 (2205 8th Ave, Seattle, WA 98121)\nstart finish speaker title 09:00 09:45 Brad Bebee (AWS) Customers don\u0026rsquo;t want a graph database, so why are we still here? – slides, video 10:00 10:15 Muhammad Attahir Jibril (TU Ilmenau) Fast and Efficient Update Handling for Graph H2TAP – slides, video 10:20 11:00 coffee break 11:00 11:15 Gabor Szarnyas (CWI) LDBC Social Network Benchmark and Graphalytics – slides 11:20 11:30 Atanas Kiryakov and Tomas Kovachev (Ontotext) GraphDB – Benchmarking against LDBC SNB \u0026amp; SPB – slides, video 11:35 11:50 Roi Lipman (Redis Labs) Delta sparse matrices within RedisGraph – slides, video 11:55 12:05 Rathijit Sen (Microsoft) Microarchitectural Analysis of Graph BI Queries on RDBMS – slides, video 12:10 13:30 lunch on your own 13:30 13:45 Alastair Green (JCC) LEX \u0026ndash; LDBC Extended GQL Schema – slides, video 13:50 14:05 Ora Lassila (AWS) Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – slides, video 14:10 14:25 Jan Hidders (Birkbeck, University of London) PG-Schema: a proposal for a schema language for property graphs – slides, video 14:30 14:45 Max de Marzi (RageDB and RelationalAI) RageDB: Building a Graph 
Database in Anger – slides, video 14:50 15:30 coffee break 15:30 15:45 Umit Catalyurek (AWS) HPC Graph Analytics on the OneGraph Model – slides, video 15:50 16:05 David J. Haglin (Trovares) How LDBC impacts Trovares – slides, video 16:10 16:25 Wenyuan Yu (Alibaba Damo Academy) GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – slides, video 16:30 16:40 Scott McMillan (Carnegie Mellon University) Graph processing using GraphBLAS – slides, video 16:45 16:55 Tim Mattson (Intel) Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – slides 17:00 20:00 happy hour (rooftop grill with beverages) on the Nitro South building\u0026rsquo;s 8th floor deck TUC event locations A map of the LDBC TUC events we hosted so far.\n","permalink":"https://ldbcouncil.org/event/sixteenth-tuc-meeting/","tags":["TUC Meeting"],"title":"Sixteenth TUC Meeting"},{"categories":null,"contents":"2023 has been an eventful year for us so far. Here is a summary of our recent activities.\nOur paper The LDBC Social Network Benchmark: Business Intelligence Workload was published in PVLDB.\nDavid Püroja just completed his MSc thesis on creating a design towards SNB Interactive v2 at CWI\u0026rsquo;s Database Architectures group. 
David and I gave a deep-dive talk at the FOSDEM conference\u0026rsquo;s graph developer room titled The LDBC Social Network Benchmark (YouTube mirror).\nI gave a lightning talk at FOSDEM\u0026rsquo;s HPC developer room titled The LDBC Benchmark Suite (YouTube mirror).\nOur auditors have successfully benchmark a number of systems:\nSPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze) SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja) SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr) The results and the full disclosure reports are available under the SPB and SNB benchmark pages.\n","permalink":"https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/","tags":["datagen","snb"],"title":"LDBC SNB – Early 2023 updates"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/david-puroja-msc/","tags":[],"title":"LDBC Social Network Benchmark Interactive v2"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/snb-work-charter/","tags":[],"title":"LDBC Social Network Benchmark task force work charter"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-2306-15975/","tags":[],"title":"The LDBC Financial Benchmark"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/tpctc-ldbc-snb-interactive-v-2/","tags":[],"title":"The LDBC Social Network Benchmark Interactive workload v2: A transactional graph query benchmark with deep delete operations"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/tpctc-ldbc-organization/","tags":[],"title":"The Linked Data Benchmark Council (LDBC): Driving competition and collaboration in the graph data management 
space"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-wc-wc-2022-02/","tags":[],"title":"LDBC Extended GQL Schema (LEX) Work Charter"},{"categories":null,"contents":"LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the charasteristics of realism, scalability, determinism and usability. More than two years have elapsed since my last technical update on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we reached several goals such as we refactored the serializers to use Spark\u0026rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.\nMoving to SparkSQL We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL\u0026rsquo;s capabilites.\nSpark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.\nDealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL unsuitable. 
We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees1, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators\u0026rsquo; state between the UDFs to ensure deterministic execution. Weighing these and that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:\ntable manipulations related to shaping the output into the supported layouts and data types as set forth in the specification; deriving the Interactive and BI datasets; and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes. We refer to points (1.) and (2.) collectively as dataset transformation, while (3.) as factor generation. Initially, these had been part of the generator, extracted as part of this refactor, which resulted in cleaner, more maintainable design.\nThe diagram above shows the components on a high level. The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there\u0026rsquo;s no simple way to avoid\nit, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. 
On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.\nI\u0026rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the previous blogpost in the series or the Interactive benchmark specification.\nTransformation pipeline The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:\nexplodes edges and / or attributes into separate tables, subsets the snapshot part and creates insert / delete batches for the BI workload, subsets the snapshot part for the Interactive workload, applies formatting related options such as date time representation, serializes the data to a Spark supported format (CSV, Parquet), We utilize a flexible data pipeline that operates on the graph.\ntrait Transform[M1 \u0026lt;: Mode, M2 \u0026lt;: Mode] extends (Graph[M1] =\u0026gt; Graph[M2]) { type In = Graph[M1] type Out = Graph[M2] def transform(input: In): Out override def apply(v: Graph[M1]): Graph[M2] = transform(v) } The Transform trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let\u0026rsquo;s see some of the transformations we have.\ncase class RawToBiTransform(mode: BI, simulationStart: Long, simulationEnd: Long, keepImplicitDeletes: Boolean) extends Transform[Mode.Raw.type, Mode.BI] { override def transform(input: In): Out = ??? } case class RawToInteractiveTransform(mode: Mode.Interactive, simulationStart: Long, simulationEnd: Long) extends Transform[Mode.Raw.type, Mode.Interactive] { override def transform(input: In): Out = ??? 
} object ExplodeEdges extends Transform[Mode.Raw.type, Mode.Raw.type] { override def transform(input: In): Out = ??? } object ExplodeAttrs extends Transform[Mode.Raw.type, Mode.Raw.type] { override def transform(input: In): Out = ??? } Therefore, a transformation pipeline may look like this:\nval transform = ExplodeAttrs .andThen(ExplodeEdges) .andThen(RawToInteractiveTransform(params, start, end)) val outputGraph = transform(inputGraph) The Graph record has a definition field containing graph-global metadata, whereas entities holds the datasets keyed by their entity type. There are 3 graph modes currently: Raw, Interactive and BI. The BI dataset has different layout than the rest, as it contains incremental inserts and deletes for the entities additionally to the bulk snapshot. This is captured in the Layout dependent type, over which the entities are polymorphic.\nIt\u0026rsquo;s important to understand that Graph holds DataFrames, and these are lazily computed by Spark. So, Graph is merely a description of transformations used to derive the comprising datasets, which makes them subject to all the SparkSQL fanciness such as query optimization, whole stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.\ncase class GraphDef[+M \u0026lt;: Mode]( isAttrExploded: Boolean, isEdgesExploded: Boolean, useTimestamp: Boolean, mode: M, entities: Map[EntityType, Option[String]] ) case class Graph[+M \u0026lt;: Mode]( definition: GraphDef[M], entities: Map[EntityType, M#Layout] ) sealed trait Mode { type Layout /* ... */ } object Mode { final case object Raw extends Mode { type Layout = DataFrame /* ... */ } final case class Interactive(bulkLoadPortion: Double) extends Mode { type Layout = DataFrame /* ... */ } final case class BI(bulkloadPortion: Double, batchPeriod: String) extends Mode { type Layout = BatchedEntity /* ... */ } } You may notice that Transform is statically typed w.r.t. 
Mode, however other properties, like isAttrExploded, or isEdgesExploded are not captured in the type, and remain merely dynamic. This makes some nonsensical transformation pipelines (i.e. that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.\nAs we already mentioned, Graph is essentially a persistent container of EntityType -\u0026gt; DataFrame mappings. EntityType can be Node, Edge and Attr, and is used to identify the entity and embellish with static metadata, such a descriptive name and primary key, whether it is static or dynamic (as per the specification), and in case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.\nUsually, a graph transformation involves matching entities based on their EntityType, and modifying the mapping (and if required, other metadata). Take, for example, the ExplodeAttrs transformation, which explodes into separate tables the values of two columns of Person stored as arrays:\nobject ExplodeAttrs extends Transform[Mode.Raw.type, Mode.Raw.type] { override def transform(input: In): Out = { if (input.definition.isAttrExploded) { // assert at runtime that the transformation hasn\u0026#39;t been applied yet throw new AssertionError(\u0026#34;Attributes already exploded in the input graph\u0026#34;) } def explodedAttr(attr: Attr, node: DataFrame, column: Column) = attr -\u0026gt; node.select(withRawColumns(attr, $\u0026#34;id\u0026#34;.as(s\u0026#34;${attr.parent}Id\u0026#34;), explode(split(column, \u0026#34;;\u0026#34;)).as(s\u0026#34;${attr.attribute}Id\u0026#34;))) val modifiedEntities = input.entities .collect { case (k @ Node(\u0026#34;Person\u0026#34;, false), df) =\u0026gt; // match the Person node. 
This is the only one ExplodeAttrs should modify Map( explodedAttr(Attr(\u0026#34;Email\u0026#34;, k, \u0026#34;EmailAddress\u0026#34;), df, $\u0026#34;email\u0026#34;), // add a new \u0026#34;PersonEmailEmailAddress\u0026#34; entity derived by exploding the email column of Person explodedAttr(Attr(\u0026#34;Speaks\u0026#34;, k, \u0026#34;Language\u0026#34;), df, $\u0026#34;language\u0026#34;), // add a new \u0026#34;PersonSpeaksLanguage\u0026#34; entity derived by exploding the language column of Person k -\u0026gt; df.drop(\u0026#34;email\u0026#34;, \u0026#34;language\u0026#34;) // drop the exploded columns from person ) } val updatedEntities = modifiedEntities .foldLeft(input.entities)(_ ++ _) // merge-replace the modified entities in the graph val updatedEntityDefinitions = modifiedEntities .foldLeft(input.definition.entities) { (e, v) =\u0026gt; e ++ v.map{ case (k, v) =\u0026gt; k -\u0026gt; Some(v.schema.toDDL) } // update the entity definition schema to reflect the modifications } val l = lens[In] // lenses provide a terse syntax for modifying nested fields (l.definition.isAttrExploded ~ l.definition.entities ~ l.entities).set(input)((true, updatedEntityDefinitions, updatedEntities)) } Note that EntityType does not hold the dataset\u0026rsquo;s full SQL schema currently, as it\u0026rsquo;s not useful for pattern matching, but can be accessed directly from DataFrame if needed.\nInput/output The Reader and Writer typeclasses are used to read from a Source and write to a Sink respectively, terminating a graph transformation pipeline\non both ends.\ntrait Reader[T] { type Ret def read(self: T): Ret def exists(self: T): Boolean } trait Writer[S] { type Data def write(self: Data, sink: S): Unit } There are implementations under ldbc.datagen.io.instances that read a graph from a GraphSource and write to a GraphSink.\nimport ldbc.snb.datagen.model import ldbc.snb.datagen.model.Mode import ldbc.snb.datagen.io.graphs.{GraphSource, GraphSink} import 
ldbc.snb.datagen.io.instances._ // read val inputPath = \u0026#34;path/to/input/graph\u0026#34; val inputFormat = \u0026#34;parquet\u0026#34; val source = GraphSource(model.graphs.Raw.graphDef, inputPath, inputFormat) val graph = Reader[GraphSource, Graph[Mode.Raw.type]].read(source) // transform val transform = ExplodeAttrs.andThen(ExplodeEdges) val transformedGraph = transform(graph) // write val outputPath = \u0026#34;path/to/output/graph\u0026#34; val outputFormat = \u0026#34;csv\u0026#34; val sink = GraphSink(outputPath, outputFormat) Writer[GraphSink, Graph[Mode.Raw.type]].write(transformedGraph, sink) We provide Ops syntax to make it shorter:\nimport ldbc.snb.datagen.model import ldbc.snb.datagen.model.Mode import ldbc.snb.datagen.io.graphs.{GraphSource, GraphSink} import ldbc.snb.datagen.io.instances._ import ldbc.snb.datagen.io.Reader.ops._ import ldbc.snb.datagen.io.Writer.ops._ // read val inputPath = \u0026#34;path/to/input/graph\u0026#34; val inputFormat = \u0026#34;parquet\u0026#34; val graph = GraphSource(model.graphs.Raw.graphDef, inputPath, inputFormat).read // transform val transformedGraph = ??? /* ... */ // write val outputPath = \u0026#34;path/to/output/graph\u0026#34; val outputFormat = \u0026#34;csv\u0026#34; transformedGraph.write(GraphSink(outputPath, outputFormat)) The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.\nSpark has a facility to derive SparkSQL schema from case classes automatically2. We created case classes for each entity in the Raw dataset. 
We also created a typeclass EntityTraits associating these classes with their EntityType, so we can summon them (and consequently their SparkSQL schema) in the reader.\nThe case classes are used during the serialization of the generated dataset too, but more about that later.\nFactor generation As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity local aggregates where impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a managable scope, the original factor generation code had been removed.\nWe decided it\u0026rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator\u0026rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing of certain factor tables even use GraphX, which was unimaginable with the previous design.\nFactor tables are added by extending a map with a name -\u0026gt; Factor pair. 
Factor declares its input entities, and accepts a function that receives input DataFrames, and returns a single DataFrame as output.\nval factors = Map ( \u0026#34;personDisjointEmployerPairs\u0026#34; -\u0026gt; Factor(PersonType, PersonKnowsPersonType, OrganisationType, PersonWorkAtCompanyType) { case Seq(person, personKnowsPerson, organisation, workAt) =\u0026gt; val knows = undirectedKnows(personKnowsPerson) val company = organisation.where($\u0026#34;Type\u0026#34; === \u0026#34;Company\u0026#34;).cache() val personSample = person .orderBy($\u0026#34;id\u0026#34;) .limit(20) personSample .as(\u0026#34;Person2\u0026#34;) .join(knows.as(\u0026#34;knows\u0026#34;), $\u0026#34;knows.person2Id\u0026#34; === $\u0026#34;Person2.id\u0026#34;) .join(workAt.as(\u0026#34;workAt\u0026#34;), $\u0026#34;workAt.PersonId\u0026#34; === $\u0026#34;knows.Person1id\u0026#34;) .join(company.as(\u0026#34;Company\u0026#34;), $\u0026#34;Company.id\u0026#34; === $\u0026#34;workAt.CompanyId\u0026#34;) .select( $\u0026#34;Person2.id\u0026#34;.alias(\u0026#34;person2id\u0026#34;), $\u0026#34;Company.name\u0026#34;.alias(\u0026#34;companyName\u0026#34;), $\u0026#34;Company.id\u0026#34;.alias(\u0026#34;companyId\u0026#34;), $\u0026#34;Person2.creationDate\u0026#34;.alias(\u0026#34;person2creationDate\u0026#34;), $\u0026#34;Person2.deletionDate\u0026#34;.alias(\u0026#34;person2deletionDate\u0026#34;) ) .distinct() }, /* more factors */ ) As you can see, it\u0026rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. undirectedKnows). Currently, there\u0026rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. 
it uses the dataframe writer under the hood.\nRevamping the data generator\u0026rsquo;s serializer At this point, both the transformation pipeline and factor generator were ready, however the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator\u0026rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we\u0026rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.\nParquet is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.\nThe new serialization framework is heavily influenced by the design of Java OutputStreams, in the sense that stateful objects are composed to form a pipeline. For example, in case of activities, the input is an activity tree, and the output is a set of rows in multiple files (e.g. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown on the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).\nThe benefit of this architecture is that only the last component needs to change when we add support for a new output format.\nTo support Parquet, we made use of row-level serializers available in Hadoop\u0026rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. 
Remember how we used case classes for the Raw entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. Forum) and Spark\u0026rsquo;s Encoder framework to encode the entities in Parquet, which means that the generated output remains consistent with the DataFrame-based reader, and we spare a lot of code duplication.\nOptimizations After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out of memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks3 per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (subsequently, larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation4. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still more than 6x reduction in cost). We weren\u0026rsquo;t able to run SF30K on 10 machines (1 machine / SF3K), even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.\n./tools/emr/submit_datagen_job.py sf3k_bi 3000 parquet bi \\ --sf-per-executor 3000 \\ --partitions 330 \\ --jar $JAR_NAME \\ --instance-type i3.4xlarge \\ --bucket $BUCKET_NAME \\ -- --explode-edges --explode-attrs ./tools/emr/submit_datagen_job.py sf10k_bi 10000 parquet bi \\ --sf-per-executor 3000 \\ --partitions 1000 \\ --jar $JAR_NAME \\ --instance-type i3.4xlarge \\ --bucket $BUCKET_NAME \\ -- --explode-edges --explode-attrs The above examples are working configurations for generating the 3K and 10K BI datasets. The --sf-per-executor option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes correspondingly. The --partitions option controls the total number of partitions, and was calculated based on the number of persons using the formula partitions = ceil(number_of_persons / block_size / 3) to get a maximum of 3 blocks per partition.\nConclusion These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.\nFootnotes The generator produces hierarchies, such as forum wall with a random number of posts, that have comments, etc. This tree is iterated, and different entities are written to separate files.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nShameless plug: You can learn more on this from another blogpost of mine.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nThe datagenerator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nThe maximum row count per file is currently 10M, however, this can be modified with a command line option. 
We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\n","permalink":"https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/","tags":["datagen","snb"],"title":"LDBC SNB Datagen – The winding path to SF100K"},{"categories":null,"contents":"Organizers: Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green\nLDBC is hosting a two-day hybrid workshop, co-located with SIGMOD 2022 on June 17-18 (Friday-Saturday).\nThe program consists of 10-15 minute talks followed by a Q\u0026amp;A session. The talks will be recorded and made available online.\nThe tenative program is the following. All times are in EDT.\nWe will have a social event on Friday at 17:30 at El Vez (Google Maps).\nFriday (Pennsylvania Convention Center, room 204B) start finish speaker title 09:20 09:30 Peter Boncz (LDBC/CWI) State of the union – slides, video 09:30 09:45 Alastair Green (LDBC/Birkbeck) LDBC\u0026rsquo;s fair use policies – slides, video 09:50 10:05 Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University) LDBC Social Network Benchmark: Business Intelligence workload v1.0 – slides, video 10:10 10:25 Heng Lin (Ant Group) LDBC Financial Benchmark introduction – slides, video 10:30 11:00 coffee break 11:00 11:15 Chen Zhang (CreateLink) New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – slides, video 11:20 11:35 James Clarkson (Neo4j) LDBC benchmarks: Promoting good science and industrial consumption – slides, video 11:40 11:55 Oskar van Rest (Oracle) Creating and querying property graphs in Oracle, on-premise and in the cloud – slides, video 12:00 12:15 Mingxi Wu (TigerGraph) Conquering LDBC SNB BI at SF-10k – slides, video 12:20 13:20 lunch (on your own) 13:20 13:35 Altan Birler (Technische Universität München) Relational databases can handle graphs too! 
Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – slides, video 13:40 13:55 David Püroja (CWI) LDBC Social Network Benchmark: Interactive workload v2.0 – slides 14:00 14:15 Angela Bonifati (Lyon 1 University) The quest for schemas in graph databases – slides, video 14:20 14:35 Matteo Lissandrini (Aalborg University) Understanding graph data representations in triplestores – slides, video 14:40 14:55 Wim Martens (University of Bayreuth) Path representations – slides, video 15:00 15:20 Audrey Cheng\t(UC Berkeley) TAOBench: An end-to-end benchmark for social network workloads – slides, video Saturday (Philadelphia Marriott Downtown, room 401-402, 4th floor) start finish speaker title 10:00 10:15 Keith Hare (WG3) An update on the GQL \u0026amp; SQL/PGQ standards efforts – slides, video 10:20 10:35 Leonid Libkin (ENS Paris) Pattern matching in GQL and SQL/PGQ – slides, video 10:40 10:55 Petra Selmer (Neo4j/WG3) An overview of GQL – slides, video 11:00 11:15 Alastair Green (LDBC/WG3) GQL 2.0: A technical manifesto – slides, video 11:20 11:35 George Fletcher (TU Eindhoven) PG-Keys (LDBC Property Graph Schema Working Group) – slides, video 11:40 11:55 Arvind Shyamsundar (Microsoft) Graph capabilities in Microsoft SQL Server and Azure SQL Database – slides, video 12:00 13:30 lunch (on your own) 13:30 13:45 Daniël ten Wolde (CWI) Implementing SQL/PGQ in DuckDB – slides, video 13:50 14:05 Oszkár Semeráth, Kristóf Marussy (TU Budapest) Generation techniques for consistent, realistic, diverse, and scalable graphs – slides, video 14:10 14:25 Molham Aref (RelationalAI) Graph Normal Form – slides, video 14:30 14:45 Naomi Arnold (Queen Mary University of London) Temporal graph analysis of the far-right social network Gab – slides, video 14:50 15:05 Domagoj Vrgoč (PUC Chile) Evaluating path queries in MillenniumDB – slides, video 15:10 15:25 Pavel Klinov, Evren Sirin (Stardog) Stardog\u0026rsquo;s experience with LDBC – slides, video 
","permalink":"https://ldbcouncil.org/event/fifteenth-tuc-meeting/","tags":["TUC Meeting"],"title":"Fifteenth TUC Meeting"},{"categories":null,"contents":"We are delighted to announce the set up of the Financial Benchmark (FinBench) task force.\nThe Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. The FinBench is scheduled to be released in the end of 2022.\nCompared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.\nThe FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). 
See the Work Charter for FinBench\nIf you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.\n","permalink":"https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/","tags":["finbench"],"title":"Announcing the LDBC Financial Benchmark Task Force"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalspvldb-szarnyas-wssbwzb-22/","tags":[],"title":"The LDBC Social Network Benchmark: Business Intelligence Workload"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalspvldb-bonifati-dfhhmms-22/","tags":[],"title":"Threshold Queries in Theory and in the Wild"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/fin-bench-work-charter/","tags":[],"title":"Work Charter for FinBench v1.0"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-tr-tr-2021-01/","tags":[],"title":"Property graphs and paths in GQL: Mathematical definitions"},{"categories":null,"contents":"LDBC was hosting a one-day hybrid workshop, co-located with VLDB 2021 on August 16 (Monday) between 16:00–20:00 CEST.\nThe physical part of the workshop was held in room Akvariet 2 of the Tivoli Hotel (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC\u0026rsquo;s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.\nTalks were scheduled to be 10 minutes with a short Q\u0026amp;A session. We had three sessions. 
Their schedules are shown below.\n[16:00–17:25 CEST] LDBC updates, benchmarks, query languages start speaker title 16:00 Peter Boncz (CWI) State of the union – slides 16:05 Gábor Szárnyas (CWI) Overview of LDBC benchmarks – slides 16:12 Mingxi Wu (TigerGraph) LDBC Social Network Benchmark results with TigerGraph – slides 16:24 Xiaowei Zhu (Ant Group) Financial Benchmark proposal – slides 16:36 Petra Selmer (Neo4j) Status report from the Existing Languages Working Group (ELWG) – slides, video 16:48 Jan Hidders (Birkbeck) Status report from the Property Graph Schema Working Group (PGSWG) – slides, video 17:00 Keith Hare (JCC Consulting) Database Language Standards Structure and Process, SQL/PGQ – slides, video 17:12 Stefan Plantikow (GQL Editor) Report on the GQL standard – slides, video coffee break (10 minutes)\n[17:35–18:45 CEST] Systems and data structures start speaker title 17:35 Vasileios Trigonakis (Oracle Labs) PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – slides, video 17:47 Matthias Hauck (SAP) JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – slides, video 17:59 Nikolay Yakovets (Eindhoven University of Technology) AvantGraph – slides, video 18:11 Semih Salihoglu (University of Waterloo) GRainDB: Making RDBMSs Efficient on Graph Workloads Through Predefined Joins – slides, video 18:23 Semyon Grigorev (Saint Petersburg University) Context-free path querying: Obstacles on the way to adoption – slides, video 18:35 Per Fuchs (Technical University of Munich) Sortledton: A universal, transactional graph data structure – slides, video coffee break (10 minutes)\n[18:55-20:00 CEST] High-level approaches and benchmarks start speaker title 18:55 Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris) Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – slides, video 19:07 Vasia Kalavri (Boston University) Learning to partition unbounded graph streams – 
slides, video 19:19 Muhammad Attahir Jibril (TU Ilmenau) Towards a Hybrid OLTP-OLAP Graph Benchmark – slides, video 19:31 Riccardo Tommasini (University of Tartu) An outlook on Benchmarks for Graph Stream Processing – slides, video 19:43 Mohamed Ragab (University of Tartu) Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – slides, video ","permalink":"https://ldbcouncil.org/event/fourteenth-tuc-meeting/","tags":["TUC Meeting"],"title":"Fourteenth TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-2112-06217/","tags":[],"title":"Graph Pattern Matching in GQL and SQL/PGQ"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-angles-bdfhhlllm-21/","tags":[],"title":"PG-Keys: Keys for Property Graphs"},{"categories":null,"contents":"LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.\nLDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.\nThis TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested in attending the event, please contact Gabor Szarnyas (BME) to register.\nSNB Task Force Progress report ACID compliance test suite Integrating deletions to Datagen Migrating Datagen to Spark Redesign of BI read queries Extensions to the driver Ongoing work Datagen: tuning the distribution of deletes Interactive 2.0 workload BI 1.0 workload Zoom links will be sent through email.\n","permalink":"https://ldbcouncil.org/event/thirteenth-tuc-meeting/","tags":["TUC Meeting"],"title":"Thirteenth TUC Meeting"},{"categories":null,"contents":"LDBC\u0026rsquo;s Social Network Benchmark [4] (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems\u0026rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.\nLDBC SNB provides Datagen (Data Generator), which produces synthetic datasets, mimicking a social network\u0026rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms, as well as public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).\nOverview The benchmark\u0026rsquo;s specification describes a social network data model which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment or like each others posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations and are fixed values. For the detailed specifications of the benchmark and the Datagen component, see References.\nDatasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).\n\\ Figure 1. LDBC SNB Datagen Process on Hadoop\nIn the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.\nNote: The diagram shows the call sequence as implemented. All steps are sequential \u0026ndash; including the relationship generation \u0026ndash;, even in cases when the data dependencies would allow for parallelization.\nEntities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. 
Most entities follow a shallow representation, i.e foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.1 A notable exception is the Knows edge which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this edge as property representation makes the data harder to handle in SQL than it would be with a flat join table.\nEntity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.2\nSerialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.\nMotivations for the migration The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:\nBetter memory utilization: MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. 
Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.\nSmaller codebase: The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.\nSmall entry cost: Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.\nIncremental improvements: Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.\nOSS, commodity: Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.\nFirst steps The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. 
The following bullet-points summarize the key notions that cropped up during the process.\nUse your memory: A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using MEMORY_AND_DISK). In short, the default caching strategy was used everywhere.\nRegression tests: Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.\nThread-safety concerns: Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn\u0026rsquo;t a concern in the original implementation due to the fact that MapReduce doesn\u0026rsquo;t use thread-based parallelization for mappers and reducers.3 In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. 
In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, 4 which makes it somewhat harder to find potential unguarded shared variables.\nCase study: Person ranking Migrating was rather straightforward, however, the so-called person ranking step required some thought. The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the S3G2 paper [3].\nThe original MapReduce version \\ Figure 2. Diagram of the MapReduce code for ranking persons\nThe implementation, shown in pseudocode above, works as follows:\nThe equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low. The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate \u0026ldquo;side-channel\u0026rdquo; file upon the completion of a reduce task. In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person. 
Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.\nThe migrated version Spark provides a sortBy function which takes care of the first step above in a single line. The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.\nBenchmarks Benchmarks were carried out on AWS EMR, originally utilising i3.xlarge instances because of their fast NVMe SSD storage and ample amount of RAM.\nThe application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yield slowdowns for higher values. The Spark version on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.5 The MapReduce results were as follows:\nSF workers Platform Instance Type runtime (min) runtime * worker/SF (min) 10 1 MapReduce i3.xlarge 16 1.60 30 1 MapReduce i3.xlarge 34 1.13 100 3 MapReduce i3.xlarge 40 1.20 300 9 MapReduce i3.xlarge 44 1.32 It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.\nFigure 3. CPU Load for the Map Reduce cluster is bursty and less than\n50% on average (SF100, 2nd graph shows master)\nFigure 4. The job only starts to consume memory when already 10 minutes\ninto the run (SF100, 2nd graph shows master)\nLet\u0026rsquo;s see how Spark fares.\nSF workers Platform Instance Type runtime (min) runtime * worker/SF (min) 10 1 Spark i3.xlarge 10 1.00 30 1 Spark i3.xlarge 21 0.70 100 3 Spark i3.xlarge 27 0.81 300 9 Spark i3.xlarge 36 1.08 1000 30 Spark i3.xlarge 47 1.41 3000 90 Spark i3.xlarge 47 1.41 A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn\u0026rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. 
In fact, the OOM errors encountered when running SF3000 supports this hypothesis even further. It was thus proposed to scale up the RAM in the instances. The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.\nFigure 5. Full CPU utilization for Spark (SF100, last graph shows\nmaster)\nFigure 6. Spark eats up memory fast (SF100, 2nd graph shows master)\ni3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it only has a 300 GB SSD.\nSF workers Platform Instance Type runtime (min) runtime * worker/SF (min) 100 3 Spark r5d.2xlarge 16 0.48 300 9 Spark r5d.2xlarge 21 0.63 1000 30 Spark r5d.2xlarge 26 0.78 3000 90 Spark r5d.2xlarge 25 0.75 10000 303 Spark r5d.2xlarge 25 0.75 The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.\nNext steps The next improvement is refactoring the serializers so they use Spark\u0026rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.\nAs already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.\nThe Spark migration also serves as an important building block for the next generation of LDBC benchmarks. 
As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for generating delete operations [1]. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.\nAcknowledgements This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. Thanks for your help and feedback!\nReferences [1] Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark\u0026rsquo;s Data Generator\n[2] 9th TUC Meeting \u0026ndash; LDBC SNB Datagen Update \u0026ndash; Arnau Prat (UPC) - slides\n[3] S3G2: a Scalable Structure-correlated Social Graph Generator\n[4] The LDBC Social Network Benchmark\n[5] LDBC - LDBC GitHub organization\nAlso makes it easier to map to a tabular format thus it is a SQL friendly representation.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nIt\u0026rsquo;s hard to imagine this done declaratively in SQL.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nInstead, multiple YARN containers have to be used if you want to parallelize on the same machine.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nAlthough editors usually render these using different font styles.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\nWith the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). During serialization, we check for such entities and omit them. 
However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.\u0026#160;\u0026#x21a9;\u0026#xfe0e;\n","permalink":"https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/","tags":["datagen","snb"],"title":"Speeding Up LDBC SNB Datagen"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-oaep-oaep-2023-04/","tags":[],"title":"LDBC Property Graph Schema contributions to WG3"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-2010-12243/","tags":[],"title":"An analysis of the SIGMOD 2014 Programming Contest: Complex queries on the LDBC social network graph"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-waudby-sps-20/","tags":[],"title":"Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark's Data Generator"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-2011-15028/","tags":[],"title":"The LDBC Graphalytics Benchmark"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-2001-02299/","tags":[],"title":"The LDBC Social Network Benchmark"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conftpctc-waudby-skmbs-20/","tags":[],"title":"Towards Testing ACID Compliance in the LDBC Social Network Benchmark"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-oaep-oaep-2023-02/","tags":[],"title":"Introduction to GQL Schema design"},{"categories":null,"contents":"LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.\nLDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management 
technologies or (4) learn about new graph technologies from researchers or industry \u0026ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.\nThis TUC meeting will be a one-day event on the last Friday of SIGMOD/PODS 2019 in Amsterdam, The Netherlands, in the conference venue of Beurs van Berlage. The room is the Mendes da Silva kamer. Please check its tips for accommodation in Amsterdam.\nNote also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called GRADES-NDA 2019, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).\nWe welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu to register.\n=\u0026gt; registration is free, but required \u0026lt;=\nYou need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.\nTalk proposals can be sent to Peter Boncz, who is also the local organizer. 
Please also send your slides to this email for archiving on this site.\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nAgenda In the TUC meeting, there will be:\nupdates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads. talks by data management practitioners highlighting graph data management challenges and products The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).\nThe TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (room: Mendes da Silva kamer):\n08:30-10:30 LDBC Board Meeting (non-public)\n10:30-11:00 Coffee\n11:00-12:45 Session 1: Graph Benchmarks\n11:00-11:05 Welcome \u0026amp; introduction\n11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap\n11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system\n12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations\n12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics\n12:45-14:00 Lunch\n14:00-16:05 Session 2: Graph Query Languages\n14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report\n14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features, report\n14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard\n15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in 
GSQL, TigerGraph\u0026rsquo;s query language\n15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language\n16:05-16:30 Coffee\n16:30-17:50 Session 3: Graph System Performance\n16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark\n16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity pptx\n17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph\n17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data\nIf there is interest, we will organize a social dinner on Friday evening for LDBC attendees.\n","permalink":"https://ldbcouncil.org/event/twelfth-tuc-meeting/","tags":["TUC Meeting"],"title":"Twelfth TUC Meeting"},{"categories":null,"contents":"LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry \u0026ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.\nThis TUC meeting will be a one-day event preceding the SIGMOD/PODS 2018 conference in Houston, Texas (not too far away, the whole next week). Note also that at SIGMOD/PODS in Houston on Sunday 10, there is a research workshop on graph data management technology called GRADES-NDA 2018 as well, so you might combine travel.\nWe welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu to register.\n=\u0026gt; registration is free, but required \u0026lt;=\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (boncz@cwi.nl) and Larri (larri@ac.upc.edu). Local organizer is Juan Sequeda (juanfederico@gmail.com).\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nAgenda In the TUC meeting there will be:\nupdates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads. 
talks by data management practitioners highlighting graph data management challenges and products The meeting will start on Friday morning, with a program from 10:30-17:00:\n10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting\n10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo\n11:00-11:30 coffee break\n11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing\n11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC\n12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI\n12:45-14:00 lunch\n14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark\n14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle\n14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service\n15:15-15:40 coffee break\n15:40-16:05 Bryon Jacob (data.world): Broadening the Semantic Web\n16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph\n16:30-16:55 Arthur Keen (Cambridge Semantics): AnzoGraph\n16:55-17:20 Molham Aref (relational.ai)) - Introducing.. relational.ai\n18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701\nLocation The TUC will be held at the University of Texas at Austin, Department of Computer Science in the Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712 Room: GDC 6.302\nThe GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take or the elevator to the 6th floor. Exit the elevator on the 6th floor. 
Turn left, right, left.\nFrom Austin to SIGMOD/PODS (Houston) on Saturday June 9 Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.\nBus One option is to take a MegaBus that departs from downtown Austin and arrives at downtown Houston.\nThere is a bus that departs at 12:00PM and arrives at 3:00pm. Cost is $20 (as of April 23).\nIf you want to spend the day in Austin, there is a bus that departs at 9:55PM and arrives at 12:50am. Cost is $5 (as of April 23).\n","permalink":"https://ldbcouncil.org/event/eleventh-tuc-meeting/","tags":["TUC Meeting"],"title":"Eleventh TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-oaep-oaep-2023-01/","tags":[],"title":"SQL/PGQ data model and graph schema"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confgrades-szarnyas-pampkeb-18/","tags":[],"title":"An early look at the LDBC Social Network Benchmark's Business Intelligence workload"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-angles-abbfglpps-18/","tags":[],"title":"G-CORE: A Core for Future Graph Query Languages"},{"categories":null,"contents":"This will be a one-day event at the VLDB 2017 conference in Munich, Germany on September 1, 2017.\nTopics and activities of interest in these TUC meetings are:\nPresentation on graph data management usage scenarios. Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force. Interaction with the new LDBC Board of Directors and the LDBC organisation officials. We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Adrian Diaz (UPC) at adiaz@ac.upc.edu to register; registration is free, but required.\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nAgenda In the TUC meeting there will be:\nupdates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads. 
talks by data management practitioners highlighting graph data management challenges selected scientific talks on graph data management technology The meeting will start on Friday morning, with a program from 10:30-17:00\n10:30-12:00: TUC session (public)\nPeter Boncz (CWI): GraphQL task force update - the G-CORE proposal (pptx) Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): LDBC Graphalytics v0.9, Graphalytics Global Competition and Graphalytics Custom Benchmark 12:00-13:30: lunch break\n13:30-15:00: TUC session (public)\nArnau Prat (UPC): Datasynth: Democratizing property graph generation Marcus Paradies (SAP): SAP HANA GraphScript Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform Gaétan Hains (Huawei): Cost semantics for graph queries 15:00-15:30: break\n15:30-17:00: TUC session (public)\nPetra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017 Markus Kaindl (Springer): SN SciGraph \u0026ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset Speakers should aim for a 20-minute talk.\nFurther:\non Friday evening (19:00-21:00) there will be a social dinner at Löwenbräukeller, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich). on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public. 
Venue The Technical University of Munich (TUM) is hosting that week the VLDB conference; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.\nThe TUC meeting will be held in Room 2607 alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).\naddress: Technische Universität München (TUM), Arcisstraße 21, 80333 München\nGoogle Maps\n","permalink":"https://ldbcouncil.org/event/tenth-tuc-meeting/","tags":["TUC Meeting"],"title":"Tenth TUC Meeting"},{"categories":null,"contents":"LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.\nThis will be a two-day event at SAP Headquarters in Walldorf, Germany on February 9+10, 2017.\nThis will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:\nTwo day event with one day devoted to User\u0026rsquo;s experiences and one day devoted to benchmarking experiences. Presentation of the benchmarking results for the different benchmarks. Interaction with the new LDBC Board of Directors and the LDBC organisation officials. We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu;\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nAgenda In the TUC meeting there will be\nupdates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads. talks by data management practitioners highlighting graph data management challenges selected scientific talks on graph data management technology The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.\nThursday evening (19:00-21:00) there will be a social dinner in Heidelberg.\nFriday morning the event resumes from 9:00-12:00. 
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.\nSocial Dinner Address: Hauptstraße 217, 69117 Heidelberg\nTime: 19:00 / 7pm\n(See attachments at the bottom of the page)\nThursday start time title – speaker 9:00 Welcome and logistics - Marcus Paradies (SAP) 9:10 Intro + state of the LDBC - Josep Lluis Larriba Pey (UPC) 9:20 LDBC Graph QL task force - Hannes Voigt (TU Dresden) 9:40 PGQL Status Update and Comparison to LDBC\u0026rsquo;s Graph QL proposals - Oskar van Rest (Oracle Labs) 10:00 Adding shortest-paths to MonetDB - Dean de Leo (CWI) 10:20 coffee 10:50 Evolving Cypher for processing multiple graphs - Stefan Plantikow (Neo Technology) 11:10 Standardizing Graph Database Functionality - An Invitation to Collaborate - Jan Michels (ISO/ANSI SQL, Oracle)\u0026quot; 11:30 Dgraph: Graph database for production environment - Tomasz Zdybal (Dgraph.io) 12:00 lunch 13:00 LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap - Alexandru Iosup (TU Delft) 13:20 LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft) 13:40 LDBC SNB Datagen Update - Arnau Prat (UPC) 14:00 LDBC SNB Business Intelligence Workload: Chokepoint Analysis - Arnau Prat (UPC) 14:20 LDBC Benchmark Cost Specification (+discussion) - Moritz Kaufmann (TU Munich) 14:40 coffee break 15:10 EYWA: the Distributed Graph Engine in Huawei MIND Platform (Yinglong Xia) 15:30 Graph Processing in SAP HANA - Marcus Paradies (SAP) 15:50 Distributed Graph Analytics with Gradoop - Martin Junghanns (Univ Leipzig) 16:10 Distributed graph flows: Cypher on Flink and Gradoop - Max Kießling (Neo Technology) 16:30 closing - Peter Boncz 17:30 end Friday start time title – speaker 9:00 welcome - Peter Boncz 9:20 Graph processing in obi4wan - Frank Smit (OBI4WAN) 9:40 Graph problems in the space domain - Albrecht Schmidt (ESA) 10:00 Medical Ontologies for Healthcare - Michael Neumann (SAP) 10:20 coffee 10:50 
The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries - Gabor Szarnyas (BME) 11:10 Efficient sparse matrix computations and their generalization to graph computing applications - Albert-Jan Yzelman (Huawei) 11:30 Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge - Atanas Kyriakov (Ontotext) 12:00 lunch 13:00 LDBC Board of Directors Meeting 17:00 end Logistics Important things to know The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: link\nVenue The TUC meeting will be held in the SAP Headquarters at the SAP Guesthouse Kalipeh (https://www.kalipeh.com). The address is:\nWDF 44 / SAP Guesthouse Kalipeh\nDietmar-Hopp-Allee 15\n69190 Walldorf\nGermany\nMaps and situation Google Maps link\nGetting there By plane There are two airports close to SAP\u0026rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart- Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.\nWhen booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. 
Frankfurt Hahn is approximately one hour from the Frankfurt main airport by car.\nThe journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).\nJourney time from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).\nDriving directions Traveling from Frankfurt Airport (FRA) to SAP Headquarters:\nDirections to SAP headquarters:\nWhen leaving the airport, follow the highway symbol onto \u0026ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.\u0026rdquo; Follow the A5 to \u0026ldquo;Basel/Karlsruhe/Heidelberg.\u0026rdquo; Take exit 39 \u0026ndash; \u0026ldquo;Walldorf/Wiesloch.\u0026rdquo; Turn left onto B291. Turn right onto Dietmar-Hopp-Allee. (Should you use a navigational system which does not recognize the street name \u0026lsquo;Dietmar-Hopp-Allee\u0026rsquo; please use \u0026lsquo;Neurottstrasse\u0026rsquo; instead.)\nTraveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:\nTo get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. The route via Karlsruhe is a bit shorter yet may be more congested.\nDirections to SAP headquarters:\nWhen leaving the airport, follow the highway symbol onto \u0026ldquo;A8/Stuttgart/B27.\u0026rdquo; Stay on A8 and follow the sign for \u0026ldquo;Karlsruhe/Heilbronn/Singen/A8.\u0026rdquo; Follow A8 to Karlsruhe. Take exit 41 \u0026ndash; \u0026ldquo;Dreieck Karlsruhe\u0026rdquo; to merge onto A5 toward \u0026ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).\u0026rdquo; Take exit 39 \u0026ndash; \u0026ldquo;Walldorf/Wiesloch.\u0026rdquo; Turn left onto B291. Turn right onto Dietmar-Hopp-Allee. Parking The closest parking lot to the event location is P7 (see figure above).\nBy Train As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. 
Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.\nFrom Frankfurt Airport (FRA) to SAP Headquarters\nDirections to SAP headquarters:\nGo to Terminal 1, level T (see overview in Appendix). Go to the AIRail Terminal \u0026ndash; \u0026ldquo;Fernbahnhof\u0026rdquo; (long-distance trains). Choose a connection with the destination train station \u0026ldquo;Wiesloch\u0026ndash;Walldorf\u0026rdquo;. From station \u0026ldquo;Wiesloch\u0026ndash;Walldorf,\u0026rdquo; take bus number 707 or 721 toward \u0026ldquo;Industriegebiet Walldorf, SAP.\u0026rdquo; It is a 10-minute ride to reach bus stop \u0026lsquo;SAP headquarters\u0026rsquo;. From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters\nDirections to SAP headquarters:\nGo to the S-Bahn station in the airport, following the sign (station is called \u0026ldquo;Stuttgart Flughafen/Messe\u0026rdquo;). Take train number S2 or S3 to \u0026ldquo;Stuttgart Hauptbahnhof\u0026rdquo; (main station). From Stuttgart Hauptbahnhof choose a connection with the destination train station \u0026ldquo;Wiesloch\u0026ndash;Walldorf\u0026rdquo;. From station \u0026ldquo;Wiesloch\u0026ndash;Walldorf,\u0026rdquo; take bus number 707 or 721 toward \u0026ldquo;Industriegebiet Walldorf, SAP\u0026rdquo;. It is a 10-minute ride to reach bus stop \u0026lsquo;SAP headquarters\u0026rsquo;. 
","permalink":"https://ldbcouncil.org/event/ninth-tuc-meeting/","tags":["TUC Meeting"],"title":"Ninth TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confgrades-leo-b-17/","tags":[],"title":"Extending SQL for Computing Shortest Paths"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confgrades-ngai-hhi-17/","tags":[],"title":"Granula: Toward Fine-grained Performance Analysis of Large-scale Graph Processing Platforms"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confgrades-prat-perez-gskdb-17/","tags":[],"title":"Towards a property graph generator for benchmarking"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-oaep-oaep-2023-03/","tags":[],"title":"Cypher schema constraints proposal"},{"categories":null,"contents":"LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.\nLDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.\nTim Hegeman of TU Delft is today presenting the technical paper describing LDBC Graphalytics at the important VLDB (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.\nLDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.\nLearn more: [/ldbc-graphalytics](LDBC Graphalytics)\nGitHub: https://github.com/tudelft-atlarge/graphalytics\n","permalink":"https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/","tags":["benchmark","tu delft","graphalytics"],"title":"LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification"},{"categories":null,"contents":"The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.\nThis will be a two-day event at Oracle Conference Center in Redwood Shores facility on Wednesday and Thursday June 22-23, 2016.\nThis will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:\nTwo day event with one day devoted to User\u0026rsquo;s experiences and one day devoted to benchmarking experiences. Presentation of the benchmarking results for the different benchmarks. Interaction with the new LDBC Board of Directors and the LDBC organisation officials. We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu; in order to notify Oracle security in advance, registration requests need to be in by June 12.\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. 
Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nIn this page, you\u0026rsquo;ll find information about the following items:\nAgenda Logistics Date Venue Getting there Accommodation Agenda On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1pm.\nWednesday, 22nd of June 2016 (Room 203) (full morning: LDBC Board of Directors meeting)\n12:00 - 13:00 Lunch (provided) 13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome. 13:30 - 14:00 Peter Boncz (CWI) LDBC introduction and status update. 14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey) 14:00 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload. 14:30 Tim Hegeman (TU Delft). Social Network Benchmark, Analytics workload. 15:00 - 15:30 Coffee break 15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) 15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). Graphing Healthcare Networks: Data, Analytics, and Use Cases. 16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). Frappé: Querying and managing evolving code dependency graphs. 16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). UniProt: challenges of a public SPARQL endpoint. 17:00 - 18:30 Graph Technologies (chair Peter Boncz) 17:00 Eugene I. Chong (Oracle USA). 
Balancing Act to improve RDF Query Performance in Oracle Database. 17:30 Lijun Chang (University of New South Wales). Efficient Subgraph Matching by Postponing Cartesian Products. 18:00 Weining Qian (East China Normal University). On Statistical Characteristics of Real-Life Knowledge Graphs. Thursday, 23rd of June 2016 (Room 203) 08:00 - 09:00 Breakfast (provided) 09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) 09:00 Peter Boncz (CWI). Query Language Task Force status 09:45 Marcus Paradies (SAP). Social Network Benchmark, Business Intelligence workload 10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) 10:00 Sergey Edunov (Facebook). Generating realistic trillion-edge graphs 10:30 George Fletcher (TU Eindhoven). An open source framework for schema-driven graph instance and graph query workload generation. 11:00 Yinglong Xia (Huawei Research America): An Efficient Big Graph Analytics Platform. 11:30 Zhe Wu (Oracle USA). Bridging RDF Graph and Property Graph Data Models 12:00 - 13:30 Lunch (provided) 13:30 - 15:30 Graph Technologies (chair Arnau Prat) 13:30 Tobias Lindaaker (Neo Technology). An open standard for graph queries: the Cypher contribution 14:00 Arash Termehchy (Oregon State University). Toward Representation Independent Graph Querying \u0026amp; Analytics 14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). In the service of the federation 15:00 Nandish Jayaram (Pivotal). Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs. 15:30 - 16:00 Coffee break 16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) 16:00 Jans Aasman (Franz Inc.). Semantic Data Lake for Healthcare 16:15 Kevin Madden (Tom Sawyer Software). Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis 16:45 Juan Sequeda (Capsenta). 
Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources 17:15 Kevin Wilkinson (Hewlett Packard Labs). LDBC SNB extensions 17:45 - 18:15 Closing discussion Friday, 24th of June 2016 (Room 105) At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (GRADES16).\n18:30 social dinner for GRADES registrants (place to be announced)\nLogistics Date 22nd and 23rd June 2016\nVenue The TUC meeting will be held in the Oracle Conference Center\nThe address is:\nRoom 203 (Wed-Thu) \u0026amp; Room 105 (Fri)\nOracle Conference Center\n350 Oracle Parkway\nRedwood City, CA 94065, USA\nMaps and situation\nGoogle Maps link\nOracle Campus map:\nGetting there Driving directions [Southbound] - Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right. [Northbound] - Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right. Parking The Conference Center has a designated parking lot located directly across from the building. If the lot is filled there is also additional parking in any of the parking garages located near by. No parking permits are needed.\nPublic transport Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. 
Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.\nCaltrain timetables: http://www.caltrain.com/schedules/weekdaytimetable.html Oracle Shuttle timetables: http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.\nAlternatively, SamTrans (San Mateo County\u0026rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.\n","permalink":"https://ldbcouncil.org/event/eighth-tuc-meeting/","tags":["TUC Meeting"],"title":"Eighth TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsemweb-kotsev-mpefk-16/","tags":[],"title":"Benchmarking RDF Query Engines: The LDBC Semantic Publishing Benchmark"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalspvldb-iosup-hnhpmccsat-16/","tags":[],"title":"LDBC Graphalytics: A Benchmark for Large-Scale Graph Analysis on Parallel and Distributed Platforms"},{"categories":null,"contents":"Apache Flink [1] is an open source platform for distributed stream and batch data processing. Flink\u0026rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.\nFlink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. 
In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.\nThe following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:\nDataSet\u0026lt;String\u0026gt; text = env.fromElements( \u0026#34;He who controls the past controls the future.\u0026#34;, \u0026#34;He who controls the present controls the past.\u0026#34;); DataSet\u0026lt;Tuple2\u0026lt;String, Integer\u0026gt;\u0026gt; wordCounts = text .flatMap(new LineSplitter()) // splits the line and outputs (word,1) tuples.groupBy(0) // group by word .sum(1); // sum the 1\u0026#39;s wordCounts.print(); At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop [2]. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing [3]. Using the class LDBCToFlink, LDBC output files can be read directly from HDFS or from the local file system:\nfinal ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); final LDBCToFlink ldbcToFlink = new LDBCToFlink( \u0026#34;hdfs:///ldbc_snb_datagen/social_network\u0026#34;, // or \u0026#34;/path/to/social_network\u0026#34; env); DataSet\u0026lt;LDBCVertex\u0026gt; vertices = ldbcToFlink.getVertices(); DataSet\u0026lt;LDBCEdge\u0026gt; edges = ldbcToFlink.getEdges(); The tuple classes LDBCVertex and LDBCEdge hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. 
When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.\nEach LDBCVertex instance contains:\nan identifier, which is unique among all vertices * a vertex label (e.g. Person, Comment) * a key-value map of properties including also multivalued properties\n(e.g. Person.email) Each LDBCEdge instance contains:\nan identifier, which is unique among all edges an edge label (e.g. knows, likes) a source vertex identifier a target vertex identifier a key-value map of properties The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label Person and edges with the label knows and use Gelly to compute the connected components of that subgraph. The full source code is available on GitHub [4].\nfinal ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); final LDBCToFlink ldbcToFlink = new LDBCToFlink( \u0026#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network\u0026#34;, env); // filter vertices with label “Person” DataSet\u0026lt;LDBCVertex\u0026gt; ldbcVertices = ldbcToFlink.getVertices() .filter(new VertexLabelFilter(LDBCConstants.VERTEX_CLASS_PERSON)); // filter edges with label “knows” DataSet\u0026lt;LDBCEdge\u0026gt; ldbcEdges = ldbcToFlink.getEdges() .filter(new EdgeLabelFilter(LDBCConstants.EDGE_CLASS_KNOWS)); // create Gelly vertices suitable for connected components DataSet\u0026lt;Vertex\u0026lt;Long, Long\u0026gt;\u0026gt; vertices = ldbcVertices.map(new VertexInitializer()); // create Gelly edges suitable for connected components DataSet\u0026lt;Edge\u0026lt;Long, NullValue\u0026gt;\u0026gt; edges = ldbcEdges.map(new EdgeInitializer()); // create Gelly graph Graph\u0026lt;Long, Long, NullValue\u0026gt; g = Graph.fromDataSet(vertices, edges, env); // run connected components on the subgraph for 10 
iterations DataSet\u0026lt;Vertex\u0026lt;Long, Long\u0026gt;\u0026gt; components = g.run(new ConnectedComponents\u0026lt;Long, NullValue\u0026gt;(10)); // print the component id of the first 10 vertices components.first(10).print(); The ldbc-flink-import tool is available on Github [3] and licensed under the GNU GPLv3. If you have any questions regarding the tool please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.\nIf you want to learn more about Apache Flink, a good starting point is the main documentation [5] and if you have any question feel free to ask the official mailing lists.\nThere is also a nice set of videos [6] available from the latest Flink Forward conference.\nReferences [1] http://flink.apache.org/\n[2] https://github.com/dbs-leipzig/gradoop\n[3] https://github.com/s1ck/ldbc-flink-import\n[4] https://gist.github.com/s1ck/b33e6a4874c15c35cd16\n[5] https://ci.apache.org/projects/flink/flink-docs-release-0.10/\n[6] https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA\n","permalink":"https://ldbcouncil.org/post/ldbc-and-apache-flink/","tags":["flink","datagen","snb"],"title":"LDBC and Apache Flink"},{"categories":null,"contents":"The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.\nThis will be a two-day event at IBM\u0026rsquo;s TJ Watson facility on Monday and Tuesday November 9/10, 2015.\nThis will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:\nTwo day event with one day devoted to User\u0026rsquo;s experiences and one day devoted to benchmarking experiences. Presentation of the benchmarking results for the different benchmarks. Interaction with the new LDBC Board of Directors and the LDBC organisation officials. We welcome all users of RDF and Graph technologies to attend. 
If you are interested, please, contact Damaris Coll (UPC) at damaris@ac.upc.edu; in order to notify IBM security in advance, registration requests need to be in by Nov 1.\nIn the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.\nFurther, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.\nIn this page, you\u0026rsquo;ll find information about the following items:\nAgenda Logistics\n- Date\n- Venue\n- Maps and situation\n- Getting there Agenda Monday, 9th of November 2015\n8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)\n9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)\n9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)\n9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload\n10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload\n10:30-11:00 Coffee break\n11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)\n11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.\n11:30 Claudio Gutierrez (U Chile). 
Query Language Task Force status.\n12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status\n12:30 - 14:00 Lunch break\n14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)\n14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox\n14:30 Peter Kogge (Notre Dame). BFS as in Graph500 on today\u0026rsquo;s architectures\n15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G\n15:30-16:00 Coffee break\n16:00 - 17:00 Technologies (chair Irini Fundulaki)\n16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store\n16:30 David Ediger (GeorgiaTech). STINGER\n17:00 Gary King (Franz Inc.). AllegroGraph\u0026rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties\n17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics\n18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase\n19:00 Social dinner\nTuesday 10th November 2015\n9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)\n9:00 Philip Rathle (Neo). On openCypher\n9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification\n9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions\n10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation\n10:30 - 11:00 Coffee break\n11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)\n11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL\n11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,\n11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis\n12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives\n12:30 - 14:00 Lunch break\n14:00 LDBC Board of Directors\nLogistics Date 9th and 10th November 2015\nVenue The TUC meeting will be held in the IBM Thomas J Watson Research Center.\nThe address is:\nIBM Thomas J Watson Research Center\n1101 Kitchawan Rd,\nYorktown Heights, NY 10598, USA\nIf you are using a GPS system, please enter \u0026ldquo;200 Aqueduct Road, Ossining NY, 10562\u0026rdquo; for accurate directions to the lab entrance. You may also want to check the routing online.\nThe meeting will take place in the Auditorium on November 9th, and in Meeting Room 20-043 on November 10th.\nMaps and situation You are highly suggested to rent a car for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walkable distance to the IBM T.J. Watson Research Center. Feel free to find carpool with other attendees. You may find car rental and hotels through www.orbitz.com, or www.expedia.com Feel free to email yxia@us.ibm.com for any questions.\nGetting there Upper and Eastern New England\nRoute I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. IBM is on the left.\nNew Haven and Connecticut Shores\nMerritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.\nNew Jersey\nTake New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.\nUpstate New York\nRoute I-84 east across Newburgh-Beacon Bridge to Exit 16-S. 
Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.\nNew York City (Manhattan)\nHenry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.\nJohn F. Kennedy International Airport\nNorth on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.\nLaGuardia Airport\nEast on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. Continue with instructions from John F. Kennedy International Airport, above.\nNewark International Airport\nNorth on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.\nStewart International Airport\nRoute 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.\nWestchester County Airport\nRight on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. 
IBM is on the left.\nPublic Transportation\nMetropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.\n","permalink":"https://ldbcouncil.org/event/seventh-tuc-meeting/","tags":["TUC Meeting"],"title":"Seventh TUC Meeting"},{"categories":null,"contents":"The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using instance matching techniques and tools. Instance matching is also known as record linkage [1], duplicate detection [2], entity resolution [3] and object identification [4].\nFor instance, a search in Geonames (http://www.geonames.org/) for \u0026ldquo;Athens\u0026rdquo; would return a resource (i.e., URI) accompanied with a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as for instance DBpedia (http://dbpedia.org/) or Open Government Datasets (http://data.gov.gr/). To obtain all necessary information about the city of Athens we need to establish that the retrieved resources refer to the same real world object.\nWeb resources are published by \u0026ldquo;autonomous agents\u0026rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. 
Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data. Clearly, these reasons are not limited to problems that did arise in the era of Web Data, it is thus not surprising that instance matching systems have been around for several years [2][5].\nIt is though essential at this point to develop, along with instance and entity matching systems, instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs. Hence, well defined, and good quality benchmarks are important for comparing the performance of the available or under development instance matching systems. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.\nAn instance matching benchmark for Linked Data consists of a source and target dataset implementing a set of test-cases, where each test case addresses a different kind of requirement regarding instance matching, a ground truth or gold standard and finally the evaluation metrics used to assess the benchmark.\nDatasets are the raw material of a benchmark. A benchmark comprises of a source and target dataset and the objective of an instance matching system is to discover the matches of the two. 
Datasets are characterized by (a) their nature (real or synthetic), (b) the schemas/ontologies they use, (c) their domains, (d) the languages they are written in, and (e) the variations/heterogeneities of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. Synthetic datasets are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner\nDatasets (and benchmarks) may contain different kinds of variations that correspond to different test cases. According to Ferrara et.al. [6][7], three kinds of variations exist for Linked Data, namely data variations, structural variations and logical variations. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.\nThe common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations. On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.\nThe gold standard is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. 
For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.\nLast, an instance matching benchmark uses evaluation metrics to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard precision, recall and f-measure metrics.\nReferences [1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. WWW 2006.\n[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamaza, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).\n[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.\n[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.\n[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. 
Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.\n[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3rd ISWC workshop on ontology matching (OM 2008).\n[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web. In ESWC, 2011.\n","permalink":"https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/","tags":["instance matching","spb"],"title":"Elements of Instance Matching Benchmarks: a Short Overview"},{"categories":null,"contents":"In this post we will look at running the LDBC SNB on Virtuoso.\nFirst, let\u0026rsquo;s recap what the benchmark is about:\nfairly frequent short updates, with no update contention worth mentioning\nshort random lookups\nmedium complex queries centered around a person\u0026rsquo;s social environment\nThe updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.\nThe DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, per se, since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.\nThe workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.\nThe test system is the same as used in the TPC-H series: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the feature/analytics branch of v7fasttrack, available from www.github.com.\nThe dataset is the SNB 300G set, with:\n1,136,127 persons 125,249,604 knows edges 847,886,644 posts, including replies 1,145,893,841 tags of posts or replies 1,140,226,235 likes of posts or replies As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.\nBelow are the numerical quantities for a 400K operation run after 150K operations worth of warmup.\nDuration: 10:41.251\nThroughput: 623.71 (op/s)\nThe statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.\n% of total total_wait name count mean min max 20% 4,231,130 LdbcQuery5 656 6,449.89 245 10,311 11% 2,272,954 LdbcQuery8 18,354 123.84 14 2,240 10% 2,200,718 LdbcQuery3 388 5,671.95 468 17,368 7.3% 1,561,382 LdbcQuery14 1,124 1,389.13 4 5,724 6.7% 1,441,575 LdbcQuery12 1,252 1,151.42 15 3,273 6.5% 1,396,932 LdbcQuery10 1,252 1,115.76 13 4,743 5% 1,064,457 LdbcShortQuery3PersonFriends 46,285 22.9979 0 2,287 4.9% 1,047,536 LdbcShortQuery2PersonPosts 46,285 22.6323 0 2,156 4.1% 885,102 LdbcQuery6 1,721 514.295 8 5,227 3.3% 707,901 LdbcQuery1 2,117 334.389 28 3,467 2.4% 521,738 LdbcQuery4 1,530 341.005 49 2,774 2.1% 440,197 LdbcShortQuery4MessageContent 46,302 9.50708 0 2,015 1.9% 407,450 LdbcUpdate5AddForumMembership 14,338 28.4175 0 2,008 1.9% 405,243 LdbcShortQuery7MessageReplies 46,302 8.75217 0 2,112 1.9% 404,002 LdbcShortQuery6MessageForum 46,302 8.72537 0 1,968 1.8% 387,044 LdbcUpdate3AddCommentLike 12,659 30.5746 0 2,060 1.7% 361,290 LdbcShortQuery1PersonProfile 46,285 7.80577 0 2,015 1.6% 334,409 LdbcShortQuery5MessageCreator 46,302 7.22234 0 2,055 1% 220,740 LdbcQuery2 1,488 148.347 2 2,504 0.96% 205,910 LdbcQuery7 1,721 119.646 11 2,295 0.93% 198,971 LdbcUpdate2AddPostLike 5,974 33.3062 0 1,987 0.88% 189,871 LdbcQuery11 2,294 82.7685 4 2,219 0.85% 182,964 LdbcQuery13 2,898 63.1346 1 2,201 0.74% 158,188 LdbcQuery9 78 2,028.05 1,108 4,183 0.67% 143,457 LdbcUpdate7AddComment 3,986 35.9902 1 1,912 0.26% 54,947 LdbcUpdate8AddFriendship 571 96.2294 1 988 0.2% 43,451 LdbcUpdate6AddPost 1,386 31.3499 1 2,060 0.01% 1,848 LdbcUpdate4AddForum 103 17.9417 1 65 0.00% 44 LdbcUpdate1AddPerson 2 22 10 34 At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.\nThe implementation is well optimized in general but still has maybe 30% room for improvement. 
We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.\nThe set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:\nCardinality estimation under heavy data skew — Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.\nCovering indices — Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. For example, an index on post by author can also contain the post\u0026rsquo;s creation date.\nMulti-hop graph traversal — Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.\nTop K — Most queries returning posts order results by descending date. 
Once there are at least k results, anything older than the k-th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top k.\nLate projection — Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.\nMaterialization — Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.\nConcurrency control — Read-write contention is rare, as updates are randomly spread over the database. However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.\nIn subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.\nSNB Interactive Series SNB Interactive, Part 1: What is SNB Interactive Really About? 
SNB Interactive, Part 2: Modeling Choices SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso ","permalink":"https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/","tags":["snb","interactive"],"title":"SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso"},{"categories":null,"contents":"Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.\nAmong the papers published in this edition we have \u0026ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms\u0026rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in https://github.com/ldbc) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have \u0026ldquo;Microblogging Queries on Graph Databases: an Introspection\u0026rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention \u0026ldquo;Frappé: Querying the Linux Kernel Dependency Graph\u0026rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.\nCheck the complete agenda.\nMeet you in Melbourne!\n","permalink":"https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/","tags":["sigmod","graphalytics","grades","snb","datagen","workshop"],"title":"SNB and Graphs Related Presentations at GRADES '15"},{"categories":null,"contents":"​SNB Interactive is the wild frontier, with very few rules. 
This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.\nIn the case of Virtuoso, we have played with SQL and SPARQL implementations. For a fixed schema and well known workload, SQL will always win. The reason for this is that this allows materializing multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.\nSchema Design SNB has a regular schema described by a UML diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.\nThe only table-level choice has to do with whether posts and comments are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. 
There is a single nullable foreign key from the reply to the post/comment being replied to.\nThe workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. There, having a composite key of ps_creatorid, ps_creationdate, ps_postid pays off since the top-k on creationdate can be pushed down into the index without needing a reference to the table.\nThe implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the Sparksee and Neo4J implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.\nThe benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 \u0026ldquo;top of the wall\u0026rdquo; to a single lookup. This does not, however, appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. 
The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. person1, person2 -\u0026gt; weight. Person1 is by convention the one with the smaller p_personid. Note that comparing id\u0026rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI\u0026rsquo;s with disastrous performance implications unless an implementation specific trick were used.\nIn the next installment we will analyze an actual run.\nSNB Interactive Series SNB Interactive, Part 1: What is SNB Interactive Really About? SNB Interactive, Part 2: Modeling Choices SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso ","permalink":"https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/","tags":["snb","virtuoso","interactive"],"title":"SNB Interactive Part 2: Modeling Choices"},{"categories":null,"contents":"LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.\nOn the industry track, LDBC will be presenting the Social Network Benchmark Interactive Workload by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).\nYou can read more about the Social Network Benchmark here and collaborate if you\u0026rsquo;re interested!\nThe other presentation will be at the GRADES workshop within the SIGMOD program regarding Graphalytics: A Big Data Benchmark for Graph-Processing platforms by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.\nDon\u0026rsquo;t forget to check our presentations if you\u0026rsquo;re attending the SIGMOD!\n","permalink":"https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/","tags":["sigmod","grades","snb","graphalytics","workshop"],"title":"LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference"},{"categories":null,"contents":"This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.\nWith two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. 
The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.\nThe essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.\nSo far, we see that SNB confronts the implementor with choices in the following areas:\nData model: Relational, RDF, property graph? Physical model, e.g. row-wise vs. column wise storage Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc. Parameters vs. literals: Sometimes different parameter values result in different optimal query plans Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing. In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.\nData generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary, re-generating every time is not an option. 
Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce? Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a \u0026lsquo;fast\u0026rsquo; and \u0026lsquo;slow\u0026rsquo; case of a single query template. How long does one need to run to balance these fluctuations? Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput. Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse. Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable. Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy? Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters? 
The following posts will look at the above in light of actual experience.\nSNB Interactive Series SNB Interactive, Part 1: What is SNB Interactive Really About? SNB Interactive, Part 2: Modeling Choices SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso ","permalink":"https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/","tags":["snb","virtuoso","interactive"],"title":"SNB Interactive Part 1: What Is SNB Interactive Really About?"},{"categories":null,"contents":"In a previous 3-part blog series we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. What we didn\u0026rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - \u0026ldquo;graph\u0026rdquo; is the word that best captures the novelty and difficulty of this work.\nWorkload Execution - Traditional vs Graph\nTransactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.\nTo understand what is meant by \u0026ldquo;traditional relational workloads\u0026rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. 
Note, \u0026ldquo;dependency\u0026rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator\u0026rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order for a given user. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.\nA significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each others walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other user\u0026rsquo;s posts etc) initiated by a given user. 
Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.\nSuch scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend\u0026rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.\nBecause it\u0026rsquo;s a graph\nIn short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.\n","permalink":"https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/","tags":["snb","driver","interactive"],"title":"Why Do We Need an LDBC SNB-Specific Workload Driver?"},{"categories":null,"contents":"As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.\nFirst of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during friendship edges generation process. Then, for each person of the block, three types of forums are created:\nThe wall of the person\nThe albums of the person\nThe groups where the person is a moderator\nWe will put our attention to group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.\nAfter assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible of, given a forum, generate a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.\nFlashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to \u0026ldquo;Enrique Iglesias\u0026rdquo; tag, whose level is 11 and occurs on 29th of May of 2012 at 09:33:47.\nOnce the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:\nDetermine the number of posts to generate\nSelect a random member of the group that will generate the post\nDetermine the event the post will be related to given the aforementioned cumulative distribution\nAssign the date of the post based on the event date\nIn order to assign the date to the post, based on the date of the event the post is assigned to, we follow the following probability density, which has been extracted from [1]. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.\nFollowing the example of \u0026ldquo;Enrique Iglesias\u0026rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.\nIn this blog entry we have seen how datagen creates event driven user activity. This allows us to reproduce the heterogenous post creation density found in a real social network, where post creation is driven by real world events.\nReferences [1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. KDD 2009: 497-506\n","permalink":"https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/","tags":["datagen","social network","snb"],"title":"Event Driven Post Generation in Datagen"},{"categories":null,"contents":"The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.\nThis will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on Thursday and Friday March 19/20, 2015.\nThe LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. 
The event will basically set the following aspects:\nTwo day event with one day devoted to User\u0026rsquo;s experiences and one day devoted to benchmarking experiences. Presentation of the first benchmarking results for the different benchmarks. Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials. Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations. We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact damaris@ac.upc.edu.\nAgenda Thursday 19th March\n11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)\n11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – slides\n12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)\n12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. – slides\n12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain\n12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive\n13:10 Claudio Martella (VUA): Giraph and Lighthouse\n13:30 - 14:30 Lunch break\n14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)\n14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production slides\n14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services slides\n15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph slides\n15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs\n18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.\n20:00 Social dinner at Bastaix Restaurant.\nFriday 20th March\n9:30 - 11:00 Technology and Benchmarking (chair: Josep L. 
Larriba-Pey)\n9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics\n9:50 Alexandru Iosup (TU Delft). Graphalytics: A big data benchmark for graph-processing platforms\n10:10 John Snelson (MarkLogic): Introduction to MarkLogic\n10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload\n10:50 Moritz Kaufmann. The auditing experience\n11:15 - 11:45 Coffee break\n11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)\n11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox\n12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data\n12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments\n12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. slides\n13:30 - 14:30 Lunch break\n15:00 LDBC Board of Directors\nLogistics Date 19th and 20th March 2015\nVenue The TUC meeting will be held at \u0026ldquo;Aula Master\u0026rdquo; at A3 building located inside the \u0026ldquo;Campus Nord UPC\u0026rdquo; in Barcelona. The address is:\nAula Master\nEdifici A3, Campus Nord UPC\nC. Jordi Girona, 1-3\n08034 Barcelona, Spain\nMaps and situation To reach the campus, there are several options, including Taxi, Metro and Bus.\nFinding UPC Finding the meeting room Getting there Flying: Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this map of the airport). It is possible to buy 10 packs of train tickets which makes it cheaper. 
Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.\nRail: The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to\nthe centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.\nBus: The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.\nTaxi: From the airport, you can take one of Barcelona\u0026rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.\nTrain and bus: Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: http://www.barcelona-airport.com/eng/transport_eng.htm\nThe locations of the airport and the city centre ","permalink":"https://ldbcouncil.org/event/sixth-tuc-meeting/","tags":["TUC Meeting"],"title":"Sixth TUC Meeting"},{"categories":null,"contents":"This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.\nWhen generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, have typically highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.\nThe first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 [1]. Here we summarize the paper and its contributions and findings.\nExisting synthetic graph generators such as Rmat [2] and Mag [3], are graph generators designed to produce graphs with long tailed distributions and large clustering coefficient, but completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR [4], a graph generator that not only produced graphs with realistic high level characteristics, but enforced an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it not only outputs a graph but also the communities present in that graph, hence it can be used to test the quality of a community detection algorithm.\nHowever, no one studied if the community structure produced by LFR, was in fact realistic compared to real graphs. Even though the community structure in LFR exhibits interesting properties, such as the expected larger internal density than external, or a longtailed distribution of community sizes, they lack the noise and inhomogeneities present in a real graph. And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? 
Is it more or less realistic? The authors of [1] set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to [4].\nNodes Edges Amazon 334863 925872 Dblp 317080 1049866 Youtube 1134890 2987624 Livejournal 3997962 34681189 The authors of [1] selected a set of statistical indicators to\ncharacterize the communities:\nThe clustering coefficient The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community. The bridge ratio, which is the ratio of edges whose removal disconnects the community. The diameter The conductance The size The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.\nMany of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and Bridge ratio. This suggests that communities cannot be modeled using a single model. 
* 84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact * Ground truth communities are not very isolated, they have a lot of connections pointing outside of the community. Most of the communities are small (10 or fewer nodes). In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity. Clustering Coefficient TPR Bridge Ratio Diameter Conductance Size The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.\nClustering Coefficient TPR Bridge Ratio TPRDiameter Conductance Size The main conclusions that can be extracted from DATAGEN can be summarized as follows:\nDATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio. The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the youtube and livejournal graphs. Communities of DATAGEN graphs are not, as in real graphs, isolated, but in this case their level of isolation is significantly larger. The diameter is small like in the real graphs. It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics. Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. 
From them, the authors extract the following conclusions:\nLFR graphs do not show the multimodal distribution observed in real graphs Only the diameter shows a similar shape as in the ground truth. Clustering Coefficient TPR Bridge Ratio TPRDiameter Conductance Size To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman\u0026rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. On the other hand, LFR only succeeds significantly in the case of the Diameter.\nClustering Coefficient TPR Bridge Ratio TPRDiameter Conductance Size We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could be potentially exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!\nReferences [1] Arnau Prat-Pérez, David Domínguez-Sal: How community-like is the structure of synthetically generated graphs? GRADES 2014\n[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014\n[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics\n[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. Benchmark graphs for testing community detection algorithms. 
Physical Review E 2008.\n","permalink":"https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/","tags":["datagen","social network","snb"],"title":"The LDBC Datagen Community Structure"},{"categories":null,"contents":"Publishing and media businesses are going through transformation I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendees were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. It was not long ago, in the year 2000, this stand was full. Back then the people in the Bay area were willing to pay for printed newspapers. But this is no longer true.\nWhat’s driving this change in publishing and media?\nWidespread and instantaneous distribution of information over the Internet has turned news into somewhat of a \u0026ldquo;commodity\u0026rdquo; and few people are willing to pay for it\nThe wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;\nOpen access publishing has limited the ability of academic publishers to sell journals and books at prices that were considered fair ten years ago.\nAlongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.\nImagine instant news in context, Imagine personal channels, Imagine \u0026hellip; triplestores While plain news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. 
It is also well-known that producing such news in \u0026ldquo;near real time\u0026rdquo; is difficult and expensive using legacy processes and content management technology.\nAnother example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like Factiva, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.\nFinally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.\nMany publishers have been pressed to advance their business. This, in turn, had led to quest to innovate. And semantic technology can help publishers in two fundamental ways:\nGeneration of rich and \u0026ldquo;meaningful\u0026rdquo; (trying not to use \u0026ldquo;semantic\u0026rdquo; :-) metadata descriptions; 1. Dynamic retrieval of content, based on this rich metadata, enabling better delivery. In this post I write about \u0026ldquo;semantic annotation\u0026rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. The final part of the post is about triplestores – semantic graph database engines, used in DSP. 
To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.\nSemantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing The most popular meaning of \u0026ldquo;semantic annotation\u0026rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.\nThe concept of using text-mining for automatic semantic annotation of text with respect to very large datasets, such as DBPedia, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether \u0026ldquo;Paris\u0026rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. Sometimes this is massively difficult – try to instruct a computer how to guess whether \u0026ldquo;Hilton\u0026rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or that I had the chance to meet Paris Hilton in person on the street in San Francisco.\nToday there are plenty of tools (such as the Ontotext Media and Publishing platform and DBPedia Spotlight) and services (such as Thomson Reuters’ OpenCalais and Ontotext’s S4) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios, where technology like this would revolutionize a business. 
This is the case with the Dynamic Semantic Publishing scenario described below.\nThe BBC’s Dynamic Semantic Publishing (DSP) Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.\nBBC Future Media \u0026amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. At the end of the day DSP improves the user experience on BBC’s web site.\n\u0026ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories\u0026rdquo;. \u0026ndash; Jem Rayfield, Senior Technical Architect, BBC News and Knowledge\nThe Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (\u0026ldquo;edited by exception\u0026rdquo;) and support semantic advertisement placement for audiences outside of the UK. 
The following quote explains the workflow when a new article gets into BBC’s content management system.\n\u0026ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A natural language and ontological determiner process automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.\nJournalist-published metadata is captured and made persistent for querying using the resource description framework (RDF) metadata representation and triple store technology. A RDF triplestore and SPARQL approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept \u0026ldquo;Frank Lampard\u0026rdquo;, then the framework infers and applies concepts such as \u0026ldquo;England Squad\u0026rdquo;, \u0026ldquo;Group C\u0026rdquo; and \u0026ldquo;FIFA World Cup 2010\u0026rdquo; \u0026hellip;\u0026rdquo; \u0026ndash; Jem Rayfield\nOne can consider each of the \u0026ldquo;aggregation pages\u0026rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. 
If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.\n\u0026ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content\n…we are not publishing pages, but publishing content as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.\n… The index pages are published automatically. This process is what assures us of the highest quality output, but still save large amounts of time in managing the site and makes it possible for us to efficiently run so many pages for the World Cup.\u0026rdquo; \u0026ndash; John O\u0026rsquo;Donovan, Chief Technical Architect, BBC Future Media \u0026amp; Technology\nTo get a real feeling about the load of the triplestore behind BBC\u0026rsquo;s World Cup web site, here are some statistics:\n800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;\nAverage unique page requests/day: 2 million;\nAverage SPARQL queries/day: 1 million;\n100s repository updates/inserts per minute with OWL 2 RL reasoning;\nMulti data center that is fully resilient, clustered 6 node triplestore.\nThe Semantic Publishing Benchmark LDBC\u0026rsquo;s Semantic Publishing Benchmark (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).\nSPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. 
This content is being enriched with metadata that describes it through links to reference knowledge:\nReference knowledge: taxonomies and databases that include relevant concepts, entities and factual information (e.g. sport statistics);\nMetadata for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.\nIn this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:\nAggregation queries retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;\nUpdates, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.\nSPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from Geonames for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.\nA more technical introduction to SPB can be found in this post. Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this post. 
An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.\nDespite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of \u0026ldquo;fast flowing\u0026rdquo; content need to be \u0026ldquo;dispatched\u0026rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:\nThe Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;\nReasoning is needed to map content descriptions to queries in a flexible manner;\nThere are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.\nSPB v.2.0 – steeper for the engines, closer to the publishers We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.\nThe major changes in SPB v.2.0 can be summarized as follows:\nMuch bigger reference dataset: from 170 thousand to 22 million statements. Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. 
Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;\nBetter interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;\nRetrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.\n","permalink":"https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/","tags":["industry","spb"],"title":"Industry Relevance of the Semantic Publishing Benchmark"},{"categories":null,"contents":"The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.\nIn this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompany Linked Data.\nOWL adopts the Open World Assumption and hence OWL axioms are perceived primarily to infer new knowledge. 
Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.\nThis richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.\nThe aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found here.\nSchema-Based Optimization Techniques Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.\nA simple first case is the case of constraint violation. Consider the query below, which returns all instances of class \u0026lt;A\u0026gt; which are fillers of a specific property \u0026lt;P\u0026gt;. 
If the underlying schema contains the information that the range of \u0026lt;P\u0026gt; is class \u0026lt;B\u0026gt;, and that class \u0026lt;B\u0026gt; is disjoint from class \u0026lt;A\u0026gt;, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.\nSELECT ?v WHERE { ?v rdf : type \u0026lt;A\u0026gt; . ?u \u0026lt;P\u0026gt; ?v . ?u \u0026lt;P\u0026gt; ?v1 . ?u \u0026lt;P1 \u0026gt; ?v2 . ?u \u0026lt;P2 \u0026gt; ?v3 . ?u \u0026lt;P3 \u0026gt; ?v4 . ?u \u0026lt;P4 \u0026gt; ?v5} Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class \u0026lt;A1\u0026gt; can immediately be identified as not being in the answer set.\nSELECT ?c WHERE { ?x rdf: type ?c . ?x \u0026lt;P\u0026gt; ?y . FILTER NOT EXISTS \\{ ?x rdf: type \u0026lt;A1 \u0026gt; }} Another category of schema-empowered optimizations has to do with improved selectivity estimation. In this respect, knowledge about the cardinality (minimum cardinality, maximum cardinality, exact cardinality, functionality) of a property can be exploited to formulate better query plans, even if data statistics are incomplete, missing or erroneous.\nSimilarly, taking into account class hierarchies, or the definition of classes/properties via set theoretic constructs (union, intersection) at the schema level, can provide valuable information on the selectivity of certain triple patterns, thus facilitating the process of query optimization. 
Similar effects can be achieved using information about properties (functionality, transitivity, symmetry etc).\nAs an example of these patterns, consider the query below, where class \u0026lt;C\u0026gt; is defined as the intersection of classes \u0026lt;C1\u0026gt;, \u0026lt;C2\u0026gt;. Thus, the triple pattern (?x rdf:type \u0026lt;C\u0026gt;) is more selective than (?y rdf:type \u0026lt;C1\u0026gt;) and (?z rdf:type \u0026lt;C2\u0026gt;) and this should be immediately recognizable by the optimizer, without having to resort to cost estimations. This example shows also how unnecessary triple patterns can be pruned from a query to reduce the number of necessary joins. Figure 1 illustrates the query plan obtained when the OWL intersectionOf construct is used.\nSELECT ?x WHERE { ?x rdf: type \u0026lt;C\u0026gt; . ?x \u0026lt;P1 \u0026gt; ?y . ?y rdf : type \u0026lt;C1 \u0026gt; . ?y \u0026lt;P2 \u0026gt; ?z . ?z rdf : type \u0026lt;C2 \u0026gt; } Schema information can also be used by the query optimizer to rewrite SPARQL queries to equivalent ones that are found in a form for which already known optimization techniques are easily applicable. For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property P4 is a symmetric property.\nSELECT ?y ?y1 ?y2 ?y3 WHERE { ?x \u0026lt;P1 \u0026gt; ?y . ?x \u0026lt;P2 \u0026gt; ?y1 . ?x \u0026lt;P3 \u0026gt; ?y2 . ?y3 \u0026lt;P4 \u0026gt; ?x } Conclusion In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:\nCases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results. 
Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing. Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query. Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques. This list is by no means complete, as further cases can be identified by optimizers. Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.\n","permalink":"https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/","tags":["developer","industry"],"title":"OWL-Empowered SPARQL Query Optimization"},{"categories":null,"contents":"When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people being connected by friendship relations; it contains many other things worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text, images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed of posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.\nWhen looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. 
Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries need to traverse parts of the person activity subgraph, and about 80% of all the data generated by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to make sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.\nRealistic Message Content Messages\u0026rsquo; content in DATAGEN is not random, but contains snippets of text extracted from Dbpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of comments (replies to posts), there is a probability of 0.66 that they are very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is typical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying to posts change during the flow of the conversation, moving from the post\u0026rsquo;s tags to other related or randomly selected tags.\nNon uniform activity levels In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomenon by correlating the activity level with the number of friends the person has. 
That is, the larger the amount of friends a person has, the larger the number of posts it creates, and also, the larger the number of groups it belongs to.\nTime correlated post and comment generation In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. For this reason, we observe spikes of activity around these events, where the amount of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also not all the events are equally relevant, thus having spikes larger than others. The shape of the activity is modeled following the model described in [1]. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.\nAs we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing volume activity is due to the fact that more people is added to the social network as time advances.\nIn this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.\nReferences [1] Leskovec, J., Backstrom, L., \u0026amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 497-506). 
ACM.\n","permalink":"https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/","tags":["snb","datagen"],"title":"Person Activity Subgraph Features in LDBC DATAGEN"},{"categories":null,"contents":"The SNB Driver part 1 post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we\u0026rsquo;ll drill down deeper into the details of what it means to execute \u0026ldquo;dependent queries\u0026rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.\nDefinitions Simulation Time (ST): notion of time created by data generator. All time stamps in the generated data set are in simulation time\nReal Time (RT): wall clock time\nTime Compression Ratio: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark\nOperation: read and/or write\nDependencies: operations in this set introduce dependencies in the workload. That is, for every operation in this set there exists at least one other operation (in Dependents) that can not be executed until this operation has been processed\nDependents: operations in this set are dependent on at least one other operation (in Dependencies) in the workload\nDue Time (DueT): point in simulation time at which the execution of an operation should be initiated.\nDependent Time (DepT): in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. 
For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.\nSafe Time (SafeT): time duration.\nwhen two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them\nSafeT is the minimum duration between the Dependent Time and Due Time of any operations in Dependents\nOperation Stream: sequence of operations ordered by Due Time (dependent operations must be separated by at least SafeT)\nInitiated Operations: operations that have started executing but not yet finished\nLocal Completion Time (per driver): point in simulation time behind which there are no uncompleted operations. Local Completion Time = min(min(Initiated Operations), max(Completed Operations))\nGlobal Completion Time (GCT): minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation that operation is safe to execute, i.e., the operations it depends on have all completed executing.
Global Completion Time = min(Local Completion Time)​\nExecution Window (Window): a timespan within which all operations can be safely executed\nAll operations satisfying window.startTime \u0026lt;= operation.DueT \u0026lt; window.endTime may be executed\nWithin a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window\nTo ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 \u0026lt; window.duration \u0026lt;= SafeT\nWindow duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable\nBefore any operations within a window can start executing it is required that: GCT \u0026gt;= window.startTime - (SafeT - window.duration)\nAll operations within a window must initiate and complete between window start and end times: window.startTime \u0026lt;= operation.initiate \u0026lt; window.endTime and window.startTime \u0026lt;= operation.complete \u0026lt; window.endTime\nDependency Mode: defines dependencies, constraints on operation execution order\nExecution Mode: defines how the runtime should execute operations of a given type\nTracking Dependencies Now, the fun part, making sure dependent operations are executed in the correct order.\nConsider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. 
It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).\nLogically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:\nthe set of operations that have started executing but not yet finished. Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:\nthe set of operations that have started and finished executing. Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):\nthe point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time. LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:\nDue Time: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled\nGCT: every operation (from Dependencies) with a Due Time before this point in time has completed execution\nHowever, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. 
The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:\nin addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute. Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.\nScalable execution in the Presence of Dependencies The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:\na) make the generated load less \u0026lsquo;bursty\u0026rsquo;\nb) allow the driver to \u0026lsquo;scale\u0026rsquo;, so when the driver is given more resources (CPUs, servers, etc.) it is able to generate more load.\nIn the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. 
Logically, all operations within a Window are executed at the same time, some time within the Window. No guaranty is made regarding exactly when, or in what order, an operation will execute within its Window.\nThe reasons this approach is correct are as follows:\nOperations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked\nThe minimum duration between the Dependency Time and Due Time of any operation in Dependents is known (can be calculated by scanning through workload once), this duration is referred to as Safe Time (SafeT)\nA window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until\nGCT \u0026gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT\nThe advantages of such an execution mode are as follows:\nAs no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window\nThen, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. There is no need or benefit to communicating GCT protocol message more frequently than approximately Window.duration, the side effect of which is reduced network traffic\nFurther, by making no guarantees regarding the order of execution the driver is free to reschedule operations (within Window bounds). The advantage being that operations can be rearranged in such a way as to reduce unwanted bursts of load during execution, which could otherwise occur while synchronizing GCT during demanding workloads. 
For example, a uniform scheduler may modify operation Due Times to be uniformly distributed across the Window timespan, to \u0026lsquo;smoothen\u0026rsquo; the load within a Window.\nAs with any system, there are trade-offs to this design, particularly regarding Window.duration. The main trade-off is that between \u0026lsquo;workload resolution\u0026rsquo; and scalability. Increasing Window.duration reduces synchronization but also reduces the resolution at which the workload definition is followed. That is, the generated workload becomes less like the workload definition. However, as this is both bounded and configurable, it is not a major concern. This issue is illustrated in Figure 1, where the same stream of events is split into two different workloads based on different size of the Window. The workload with Window size 5 (on the right) has better resolution, especially for the \u0026lsquo;bursty\u0026rsquo; part of the event stream.\nFigure 1. Window scheduling\nThis design also trades a small amount of repeatability for scalability: as there are no timing or ordering guarantees within a window, two executions of the same window are not guaranteed to be equivalent - \u0026lsquo;what happens in the window stays in the window\u0026rsquo;. Despite sacrificing this repeatability, the results of operations do not change. No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.\n","permalink":"https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/","tags":["snb","driver","interactive"],"title":"SNB Driver - Part 2: Tracking Dependencies Between Queries"},{"categories":null,"contents":"Up until now we have introduced the challenges faced when executing the LDBC SNB benchmark, as well as explained how some of these are overcome. 
With the foundations laid, we can now explain precisely how operations are executed.\nBased on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.\nDependency Modes While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.\nAnother way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. 
During workload execution, operations of each type are treated as follows:\n• None\nDepended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)\n– Prior Execution: do nothing – After Execution: do nothing\n• Read Only\nDepended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)\nDependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)\n– Prior Execution: wait for GCT \u0026gt;= operation.DepT – After Execution: do nothing\n• Write Only\nDepended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires these operations to have completed executing, i.e., to advance GCT)\nDependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)\n– Prior Execution: add operation to Initiated Operations\n– After Execution: remove operation from Initiated Operations, add operation to Completed Operations\n• Read Write\nDepended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires these operations to have completed executing, i.e., to advance GCT)\nDependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)\n– Prior Execution: add operation to Initiated Operations, wait for GCT \u0026gt;= operation.DepT\n– After Execution: remove operation from Initiated
Operations, add operation to Completed Operations\nExecution Modes Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more streams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.\n• Asynchronous: operations are executed individually, when their Due Time arrives.\nMotivation: This is the default execution mode, it executes operations as true to the workload definition as possible.\n– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time \u0026gt;= operation.DueT (and GCT \u0026gt;= operation.DepT)\n– Max Concurrent Executions: unbounded\n– Max Execution Time: unbounded\n– Failure: operation execution starts later than: operation.DueT + Tolerated Delay\n• Synchronous: operations are executed individually, sequentially, in blocking manner.\nMotivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency.
Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.\nThe alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.\n– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler\n– Execute When time \u0026gt;= operation.DueT and previousOperation.completed == true (and GCT \u0026gt;= operation.DepT)\n– Max Concurrent Executions: 1\n– Max Execution Time: nextOperation.DueT - operation.DueT\n– Failure: operation execution starts later than: operation.DueT + Tolerated Delay. E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay\n• Partially Synchronous (Windowed Execution, described in Section 3.4 in more detail), groups of operations from the same time window are executed together\n– Re-scheduling Before Execution: Yes, as long as the following still holds:\nwindow.startTime \u0026lt;= operation.DueT \u0026lt; window.startTime + window.duration\nOperations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due
Times, and therefore ordering, may be modified\n– Execute When time \u0026gt;= operation.DueT (and GCT \u0026gt;= operation.DepT)\n– Max Concurrent Executions: number of operations within window\n– Max Execution Time: (window.startTime + window.duration) - operation.DueT\n– Failure: operation execution starts later than: window.startTime + window.duration; operation execution does not finish by: window.startTime + window.duration\nTying it back to LDBC SNB The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stems from dependencies introduced by the graph structure. In other words, workload partitioning becomes as hard as graph partitioning.\nThe LDBC SNB data can in fact be seen as a union of two parts:\nCore Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.\nUser Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the \u0026ldquo;core\u0026rdquo; part are satisfied (i.e., users don\u0026rsquo;t post things before the profiles are created, etc.).\nIn order to avoid friendship graph partitioning, the driver introduces the concept of SafeT, the minimal simulation time that should pass between two dependent events.\nThis property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety.
Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.\nOn the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there is typically a lot of forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. we efficiently create the so-called flash-mob effects in the posting/replying workload).\n","permalink":"https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/","tags":["snb","driver","interactive"],"title":"SNB Driver - Part 3: Workload Execution Putting It All Together"},{"categories":null,"contents":"Until now we have discussed several aspects of the Semantic Publishing Benchmark (SPB) such as the difference in performance between virtual and real servers configuration, how to choose an appropriate query mix for a benchmark run and our experience with using SPB in the development process of GraphDB for finding performance issues.\nIn this post we provide a step-by-step guide on how to run SPB using the Sesame RDF data store on a fresh install of Ubuntu Server 14.04.1. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.\nPrerequisites We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:\nGit Apache Ant 1.8 or higher OpenJDK 6 or Oracle JDK 6 or higher Apache Tomcat 7 or higher If you already have these components installed on your machine you can directly proceed to the next section: Installing Sesame\nFollowing are sample commands which can be used to install the required software components:\nsudo apt-get install git sudo apt-get install ant sudo apt-get install default-jdk sudo apt-get install tomcat7 Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.\nAfter a successful installation of Apache Tomcat you should be able to get the default splash page “It works” when you open your web browser and enter the following address: http://\u0026lt;your_ip_address\u0026gt;:8080\nInstalling Sesame We will use current Sesame version 2.7.14. 
You can download it here or run following command:\nwget \\\\ \u0026#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download\u0026#34; \\\\ -O openrdf-sesame-2.7.14-sdk.tar.gz Then extract the Sesame tarball:\ntar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz To deploy sesame you have to copy the two war files that are in openrdf-sesame-2.7.14/war to /var/lib/tomcat7/webapps\nFrom openrdf-sesame-2.7.14/war you can do it with command:\ncp openrdf-*.war \u0026lt;tomcat_install\u0026gt;/webapps Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.\nBy default the configuration directory is: /usr/share/tomcat7/.aduna\nCreate the directory:\nsudo mkdir /usr/share/tomcat7/.aduna Then change the ownership:\nsudo chown tomcat7 /usr/share/tomcat7/.aduna And finally you should give the necessary permissions:\nsudo chmod o+rwx /usr/share/tomcat7/.aduna Now when you go to: http://\u0026lt;your_ip_address\u0026gt;:8080/openrdf-workbench/repositories\nYou should get a screen like this:\nSetup SPB You can download the SPB code and find brief documentation on GitHub:\nhttps://github.com/ldbc/ldbc_spb_bm\nA detailed documentation is located here:\nhttps://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf\nSPB offers many configuration options which control various features of the benchmark e.g.:\nquery mixes dataset size loading datasets number of agents validating results test conformance to OWL2-RL ruleset update rate of agents Here we demonstrate how to generate a dataset and execute a simple test\nrun with it.\nFirst download the SPB source code from the repository:\ngit clone https://github.com/ldbc/ldbc_spb_bm.git Then in the ldbc_spb_bm directory build the project:\nant build-basic-querymix If you simply execute the command:\nant you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this 
step-by-step guide, configuration shown above is sufficient.\nDepending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat\u0026rsquo;s startup files e.g. in catalina.sh:\nexport JAVA_OPTS=\u0026#34;-d64 -Xmx4G\u0026#34; To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:\nThen we need to point the benchmark test driver to the SPARQL endpoint of that repository. This is done in ldbc_spb_bm/dist/test.properties file.\nThe default value of datasetSize in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.\nYou need to change\ndatasetSize=1000000 Also the URLs of the SPARQL endpoint for the repository\nendpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.\nThese are the settings to do that, following parameters will \u0026lsquo;instruct\u0026rsquo; the SPB test driver to perform all the actions described above:\n#Benchmark Operational Phases loadOntologies=true loadReferenceDatasets=true generateCreativeWorks=true loadCreativeWorks=true generateQuerySubstitutionParameters=true validateQueryResults=false warmUp=false runBenchmark=false runBenchmarkOnlineReplicationAndBackup=false checkConformance=false To run the benchmark execute the following:\njava -jar semantic_publishing_benchmark-basic-standard.jar test.properties When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.\nNext we will measure the performance of Sesame Data Store by changing some configuration 
properties:\n#Benchmark Configuration Parameters warmupPeriodSeconds=60 benchmarkRunPeriodSeconds=300 ... #Benchmark Operational Phases loadOntologies=false loadReferenceDatasets=false generateCreativeWorks=false loadCreativeWorks=false generateQuerySubstitutionParameters=false validateQueryResults=false warmUp=true runBenchmark=true runBenchmarkOnlineReplicationAndBackup=false checkConformance=false After the benchmark test run has finished result files are saved in folder: dist/logs\nThere you will find three types of results: the result summary of the benchmark run (semantic_publishing_benchmark_results.log), brief results and detailed results.\nIn semantic_publishing_benchmark_results.log you will find the results distributed per seconds. They should be similar to the listing bellow:\nBenchmark Results for the 300-th second\nSeconds : 300 (completed query mixes : 0) Editorial: 2 agents 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) 0.0300 average operations per second Aggregation: 8 agents 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) This step-by-step guide gave an introduction on how to setup and run the SPB on a Sesame Data Store. 
Further details can be found in the reference documentation listed above.\nIf you have any troubles running the benchmark, don\u0026rsquo;t hesitate to comment or use our social media channels.\nIn a future post we will go through some of the parameters of SPB and check their performance implications.\n","permalink":"https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/","tags":["spb","sesame","rdf","tutorial","guide"],"title":"Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confwww-pham-peb-15/","tags":[],"title":"Deriving an Emergent Relational Schema from RDF Data"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalsws-loizou-ag-15/","tags":[],"title":"On the formulation of performant SPARQL queries"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-erling-alcgppb-15/","tags":[],"title":"The LDBC Social Network Benchmark: Interactive Workload"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-guisado-gamez-p-14/","tags":[],"title":"Understanding Graph Structure of Wikipedia for Query Expansion"},{"categories":null,"contents":"The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.\nThe SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:\nvalue-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation) The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:\ninstance (in)equality (owl:sameAs, owl:differentFrom) class and property equivalence (owl:equivalentClass, owl:equivalentProperty) class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties) class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf) property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty) complex class definitions (owl:unionOf, owl:intersectionOf) SPIMBench uses and extends the ontologies of LDBC\u0026rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB\u0026rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.\nValue and structure-based test cases are implemented using the SWING framework [1] on data and object type properties respectively. These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. 
Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.\nSPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.\nSPIMBench can be downloaded from our repository and a more thorough description thereof can be found on http://www.ics.forth.gr/isl/spimbench/.\nReferences [1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. In ESWC, 2011.\n","permalink":"https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/","tags":["instance matching","benchmark"],"title":"Semantic Publishing Instance Matching Benchmark"},{"categories":null,"contents":"We are presently working on the SNB BI workload. 
Andrey Gubichev of TU Munchen and myself are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.\nAs discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.\nThere are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.\nLet’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.\nOne of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequence of events and the quantity and quality of interactions between players leads to intractably long queries which are hard to understand and debug. Therefore, views and intermediate materializations become increasingly necessary.\nAnother feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. 
Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.\nSince a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.\nAn orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.\nSo, let’s see how some of these aspects could be captured in the SNB context.\nGeography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.\nThe communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or less participants. Membership in a high traffic forum with few participants would indicate a strong connection. Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -\u0026gt; connection strength. 
In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.\nThe behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster in a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.\nIn John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes nearly always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”\nAnalytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.\nFrom a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. 
Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.\nThis leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.\nThere is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.\n","permalink":"https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/","tags":["snb","bi"],"title":"Further Developments in SNB BI Workload"},{"categories":null,"contents":"LDBC\u0026rsquo;s Semantic Publishing Benchmark (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). As we wrote earlier, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. 
Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.\nLately we tested different Amazon Web Services (AWS) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that GraphDB experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.\nThe Experiment For our tests we use:\nGraphDB Standard v6.1 LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: 8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also 50M dataset (SF1) 40 minutes of benchmark run time (60 seconds of warm up) 5 different Amazon EC2 instances and one local server Each test run is cold, i.e. data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.\nWe use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. 
The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.\nWe also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.\nThe Results For the tests we measured:\nqueries/s for the read threads, where queries include SELECT and CONSTRUCT updates/s for the write threads, where an update operation is INSERT or DELETE queries/$ and updates/$ – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput update/vCPU – modification operations per vCPU per second Results (Table 1.) provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. 
Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is c3.2xlarge machine.\nThe improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold where the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.\nTable 1. SPB Measurement Results on AWS and Local Servers\nServer Type vCPUs R/W Agents RAM (GB) \u0026ldquo;Storage (GB, SSD)\u0026rdquo; Price USD/h Queries/ sec. Updates/ sec. Queries/ USD Updates/ USD Updates/ vCPU m3.xlarge 4 8/2 15 2x 40 0.28 8.39 8.23 107 882 105 873 2.06 m3.2xlarge 8 8/2 30 2x 80 0.56 15.44 15.67 99 282 100 752 1.96 c3.xlarge 4 8/2 7.5 2x 40 0.21 7.17 6.78 122 890 116 292 1.7 c3.2xlarge 8 8/2 15 2x 80 0.42 16.46 14.56 141 107 124 839 1.82 c3.4xlarge 16 8/2 30 2x 160 0.84 23.23 21.17 99 578 90 736 1.32 c3.4xlarge 16 8/3 30 2x 160 0.84 22.89 20.39 98 100 87 386 1.27 c3.4xlarge 16 10/2 30 2x 160 0.84 26.6 19.11 114 000 81 900 1.19 c3.4xlarge 16 10/3 30 2x 160 0.84 26.19 19.18 112 243 82 200 1.2 c3.4xlarge 16 14/2 30 2x 160 0.84 30.84 16.88 132 171 72 343 1.06 c3.4xlarge 16 14/3 30 2x 160 0.84 29.67 17.8 127 157 76 286 1.11 Local 32 8/2 256 8x 256 0.85 37.11 32.04 156 712 135 302 1 Local 32 8/3 256 8x 256 0.85 37.31 32.07 157 557 135 429 1 Local 32 10/2 256 8x 256 0.85 40 31.01 168 916 130 952 0.97 Local 32 14/2 256 8x 256 0.85 36.39 26.42 153 672 111 569 0.83 Local 32 14/3 256 8x 256 0.85 36.22 26.39 152 954 111 443 0.82 Local 32 20/2 256 8x 256 0.85 34.59 23.86 146 070 100 759 0.75 The Optimal Number of Test Agents Experimenting with different number of aggregation (read) and editorial (write) agents at 
c3.4xlarge and the local server, we made some interesting observations:\nThere is almost no benefit to use more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads; Using more read agents can have negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the later case GraphDB handles a bit higher amount of queries (31 vs. 23) we see a drop in the updates rates (from 21 to 17); Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations; For machines with more than 16 cores, a configuration like 10/2 or 14/2, would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server; Launching more than 14 read agents does not help even on local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization. There is some overhead when handling bigger number of agents as the results for the local server tests with 14/3 and 20/2 show the worst results for both queries and updates. Efficiency and Cost AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.\nCloud infrastructure providers like Amazon, allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.\n$1 spent on c3.2xlarge ($0.41/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!\nThe full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for machine, like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine is with 256GB of RAM, which is an overkill for Semantic Publishing Benchmark ran at 50M scale. Under all these assumptions we see that using local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference of the prices/query in this case are low indicates that using AWS services comes at very low extra cost.\nTo put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:\n100 queries/sec. handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events. 10 updates/sec. - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in separation. There are relatively few content management applications that need more than 36 000 updates per hour. 
$81/day is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times. ","permalink":"https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/","tags":["spb","amazon","ec2","aws","rdf"],"title":"Sizing AWS Instances for the Semantic Publishing Benchmark"},{"categories":null,"contents":"In previous posts (Getting started with snb, DATAGEN: data generation for the Social Network Benchmark), Arnau Prat discussed the main features and characteristics of DATAGEN: realism, scalability, determinism, usability. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the instructions for generating a SNB dataset and for loading the dataset into Virtuoso. In the following sections, we analyze several aspects of the generated dataset.\nA Realistic social graph One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (See the note about Facebook Anatomy). 
This is not by chance, as DATAGEN has been designed to deliberately reproduce the Facebook\u0026rsquo;s graph distribution.\nFigure 1: Cumulative distribution #friends per user\n#R script for generating the social degree distribution #Input files: person_knows_person_*.csv library(data.table) library(igraph) library(plotrix) require(bit64) dflist \u0026lt;- lapply(commandArgs(trailingOnly = TRUE), fread, sep=\u0026#34;|\u0026#34;, header=T, select=1:2, colClasses=\u0026#34;integer64\u0026#34;) df \u0026lt;- rbindlist(dflist) setNames(df, c(\u0026#34;P1\u0026#34;, \u0026#34;P2\u0026#34;)) d2 \u0026lt;- df[,length(P2),by=P1] pdf(\u0026#34;socialdegreedist.pdf\u0026#34;) plot(ecdf(d2$V1),main=\u0026#34;Cummulative distribution #friends per user\u0026#34;, xlab=\u0026#34;Number of friends\u0026#34;, ylab=\u0026#34;Percentage number of users\u0026#34;, log=\u0026#34;x\u0026#34;, xlim=c(0.8, max(d2$V1) + 20)) dev.off() Data Correlations Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.\nWhich are the most popular names of a country?\nWe run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, \u0026lsquo;A_country_name\u0026rsquo; is the name of a particular country such as \u0026lsquo;Germany\u0026rsquo;, \u0026lsquo;Netherlands\u0026rsquo;, or \u0026lsquo;Vietnam\u0026rsquo;.\nSELECT p_lastname, count (p_lastname) as namecnt FROM person, country WHERE p_placeid = ctry_city and ctry_name = \u0026#39;A_country_name\u0026#39; GROUP BY p_lastname order by namecnt desc; As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as Muller is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names\u0026rsquo; distribution may not be exactly the same as the contemporary names\u0026rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.\nFigure 2. Distribution of names in Germany\nFigure 3. Distribution of names in Netherlands\nFigure 4. Distribution of names in Vietnam\nWhere my friends are living?\nWe run the following query, which computes the locations of the friends of people living in China.\nSELECT top 10 fctry.ctry_name, count (*) from person self, person friend, country pctry, knows, country fctry WHERE pctry.ctry_name = \u0026#39;China\u0026#39; and self.p_placeid = pctry.ctry_city and k_person1id = self.p_personid and friend.p_personid = k_person2id and fctry.ctry_city = friend.p_placeid GROUP BY fctry.ctry_name ORDER BY 2 desc; As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.\nFigure 5. 
Locations of friends of people in China\nWhere my friends are studying?\nFinally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.\nSELECT top 10 o2.o_name, count(o2.o_name) from knows, person_university p1, person_university p2, organisation o1, organisation o2 WHERE p1.pu_organisationid = o1.o_organisationid and o1.o_name=\u0026#39;Hangzhou_International_School\u0026#39; and k_person1id = p1.pu_personid and p2.pu_personid = k_person2id and p2.pu_organisationid = o2.o_organisationid GROUP BY o2.o_name ORDER BY 2 desc; As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, top-10 universities for the friends of the Hangzhou School students’ are from China, while people from foreign universities have small number of friends that study in Hangzhou School (See Table 1).\nFigure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.\nName # of friends Hangzhou_International_School 12696 Anhui_University_of_Science_and_Technology 4071 China_Jiliang_University 3519 \u0026hellip; Darmstadt_University_of_Applied_Sciences 1 Calcutta_School_of_Tropical_Medicine 1 Chettinad_Vidyashram 1 Women\u0026rsquo;s_College_Shillong 1 Universitas_Nasional 1 Table 1. Universities where friends of Hangzhou International School students are studying at.\nIn a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduce many of those important characteristics found in a real social network, and additionally introduce a series of plausible correlations in it. 
More and more interesting data correlations may also be found from playing with the SNB generated data.\n","permalink":"https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/","tags":["developer","industry"],"title":"DATAGEN: a Realistic Social Network Data Generator"},{"categories":null,"contents":"In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: https://github.com/ldbc/ldbc_driver/. Multiple reference implementations by two vendors are available here: https://github.com/ldbc/ldbc_snb_implementations, and discussion of the schema, data properties, and related content is available here: https://github.com/ldbc/ldbc_snb_docs.\nThe following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.\nProblem Description The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person\u0026rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting transaction stream into parallel clients (terminals) is trivial. 
However, for LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.\nConsider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment to the picture of User A, etc. The second event depends on the first one in a sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in case of the single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.\nThe next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.\n","permalink":"https://ldbcouncil.org/post/snb-driver-part-1/","tags":["snb","driver","tpc-c","interactive"],"title":"SNB Driver - Part 1"},{"categories":null,"contents":"LDBC SPB (Semantic Publishing Benchmark) is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC\u0026rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an Ontotext Graph DB deployment. Graph DB was formerly known as Owlim.\nSo, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. 
Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.\nSPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:\nSome queries are centred on a particular search or entity. The data touched by the query size does not grow at the same rate as the dataset.\nSome queries cover whole cross sections of the dataset, e.g. find the most popular tags across the whole database.\nThese different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.\nAnother guiding factor of SPB was the BBC\u0026rsquo;s and others\u0026rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.\nNormally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:\nUpdates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.\nShort queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.\nAnalytics - These cover a large fraction of the dataset and are roughly linear to data size.\nA test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. 
The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. A given system may be specially good at one aspect, leading the test sponsor to accentuate this.\nThe benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 ( 1.6 billion triples) SPB database we have the following space consumption figures:\n46886 MB of RDF literal text\n23924 MB of full text index for RDF literals\n23598 MB of URI strings\n21981 MB of quads, stored column-wise with default index scheme\nClearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.\nLet\u0026rsquo;s now look at a full run at unit scale, i.e. 50M triples.\nThe run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. 
The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. The run stops by itself when the last of the analytical mixes finishes.\nThe interactive driver reports:\nSeconds run : 2144 Editorial: 2 agents 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) 39.7122 average operations per second Aggregation: 20 agents 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] 45318 total retrieval queries (0 timed-out) 22.5239 average queries per second The analytical driver reports:\nAggregation: 2 agents 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] 180 total retrieval queries (2 timed-out) 0.0862 average queries per second The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)\nThe SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). The DBMS is Virtuoso open source, (v7fasttrack at github.com, feature/analytics).\nThe minimum update rate of 7/s was sustained but fell short of the target of 70./s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. 
The analytics mix is for example about 3x faster without other concurrent activity.\nIs this good or bad? I would say that this is possible but better can certainly be accomplished.\nThe initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualifies the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.\nAs an initial comment of the workload mix, I\u0026rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.\nAdjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.\nIn the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.\n","permalink":"https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/","tags":["spb","test run"],"title":"Making Semantic Publishing Execution Rules"},{"categories":null,"contents":"The LDBC consortium are pleased to announce its fifth Technical User\nCommunity (TUC) meeting.\nThis will be a one-day event at the National Hellenic Research Institute\nin Athens, Greece on Friday November 14, 2014.\nAgenda 10:30 - 11:00 Coffee Break\n11:00 - 11:10 Peter Boncz (VUA) Welcome \u0026amp; LDBC project status update (Presentation)\n11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark:Short Presentation of SPB and Status\nFeedback \u0026amp; Roadmap for SPB \u0026amp; OWLIM (Presentation)\n11:25 - 11:30 Orri Erling (OGL) Status, Feedback \u0026amp; Roadmap for SPB \u0026amp; Virtuoso (Presentation)\n11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback \u0026amp; Roadmap for SNB \u0026amp; Neo4J (Presentation)\n11:45 - 12:00 Orri Erling (OGL) Status, Feedback \u0026amp; Roadmap for SNB \u0026amp; Virtuoso (Presentation)\n12:00 - 12:20 Arnau Prat (UPC) \u0026amp; Andrey Gubichev Status, Feedback \u0026amp; Roadmap for SNB Interactive \u0026amp; Sparksee (Presentation ) and Business Intelligence (Presentation)\n12:20 - 12:40 Tomer Sagi, \u0026ldquo;Experience with SNB and TitanDB at HP\u0026rdquo; (Presentation )\n12:40 - 13:00 Jakob Nelson, \u0026ldquo;graphbench.org on the SNB datagen\u0026rdquo;\n13:00 - 14:30 Lunch Break@Byzantine \u0026amp; Christian Museum (link)\n14:30 - 14:50 Olaf Hartig, \u0026ldquo;Integrating the Property Graph and RDF data models\u0026rdquo; (Presentation)\\\nDocuments: arxiv/1409.3288, 
arxiv/1406.3399\n14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, \u0026ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines\u0026rdquo; (Presentation)\n15:10 - 15:30 Evaggelia Pitoura, \u0026ldquo;Historical Queries on Graphs\u0026rdquo; (Presentation)\n15:30 - 16:00 Coffee Break\n16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, \u0026ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases\u0026rdquo; (Presentation)\n16:20 - 16:40 Gunes Aluc, \u0026ldquo;WatDiv: How to Tune-up your RDF Data Management System\u0026rdquo; (Presentation)\n16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, \u0026ldquo;Benchmarking @LogicBlox\u0026rdquo; (Presentation)\n17:00 - 17:15 Hassan Chafi, \u0026ldquo;Oracle Labs Graph Strategy\u0026rdquo;\n17:15 - 17:25 Yinglong Xia, \u0026ldquo;Property Graphs for Industry Solution at IBM\u0026rdquo; (Presentation)\n17:25 - 17:30 Arthur Keen, \u0026ldquo;Short Introduction to SPARQLcity\u0026rdquo;\n20:30 Dinner @ Konservokouti (link)\nGet a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion\nLogistics The meeting will be held at the National Hellenic Research Foundation located in downtown Athens.\nTravel Athens, Greece\u0026rsquo;s capital city, is easily accessible by air. Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.\nTo arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).\nYou can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). 
More information on how to move around in Athens from the airport can be found here: http://www.aia.gr/traveler/\n","permalink":"https://ldbcouncil.org/event/fifth-tuc-meeting/","tags":["TUC Meeting"],"title":"Fifth TUC Meeting"},{"categories":null,"contents":"The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.\nThe benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:\nclustering of creative works around certain entities from the reference datasets (e.g. the association of an entity with creative works would decay exponentially in time) correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time) The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. 
The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.\nCurrently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. The queries aim at checking different choke points relevant to query optimisation such as:\njoin ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema subselects that aggregate the query results that the optimiser should recognise and evaluate first optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.) 
unions to be executed in parallel optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results handling of geo-spatial predicates full-text search optimisation asynchronous execution of the aggregate sub-queries use of distinct to choose the optimal query plan We give below Query 1 of the Semantic Publishing Benchmark.\nPREFIX bbcevent:\u0026lt;http://www.bbc.co.uk/ontologies/event/\u0026gt; PREFIX geo-pos:\u0026lt;http://www.w3.org/2003/01/geo/wgs84_pos#\u0026gt; PREFIX bbc:\u0026lt;http://www.bbc.co.uk/ontologies/bbc/\u0026gt; PREFIX time:\u0026lt;http://www.w3.org/2006/time#\u0026gt; PREFIX event:\u0026lt;http://purl.org/NET/c4dm/event.owl#\u0026gt; PREFIX music-ont:\u0026lt;http://purl.org/ontology/mo/\u0026gt; PREFIX rdf:\u0026lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#\u0026gt; PREFIX foaf:\u0026lt;http://xmlns.com/foaf/0.1/\u0026gt; PREFIX provenance:\u0026lt;http://www.bbc.co.uk/ontologies/provenance/\u0026gt; PREFIX owl:\u0026lt;http://www.w3.org/2002/07/owl#\u0026gt; PREFIX cms:\u0026lt;http://www.bbc.co.uk/ontologies/cms/\u0026gt; PREFIX news:\u0026lt;http://www.bbc.co.uk/ontologies/news/\u0026gt; PREFIX cnews:\u0026lt;http://www.bbc.co.uk/ontologies/news/cnews/\u0026gt; PREFIX cconcepts:\u0026lt;http://www.bbc.co.uk/ontologies/coreconcepts/\u0026gt; PREFIX dbp-prop:\u0026lt;http://dbpedia.org/property/\u0026gt; PREFIX geonames:\u0026lt;http://sws.geonames.org/\u0026gt; PREFIX rdfs:\u0026lt;http://www.w3.org/2000/01/rdf-schema#\u0026gt; PREFIX domain:\u0026lt;http://www.bbc.co.uk/ontologies/domain/\u0026gt; PREFIX dbpedia:\u0026lt;http://dbpedia.org/resource/\u0026gt; PREFIX geo-ont:\u0026lt;http://www.geonames.org/ontology#\u0026gt; PREFIX bbc-pont:\u0026lt;http://purl.org/ontology/po/\u0026gt; PREFIX 
tagging:\u0026lt;http://www.bbc.co.uk/ontologies/tagging/\u0026gt; PREFIX sport:\u0026lt;http://www.bbc.co.uk/ontologies/sport/\u0026gt; PREFIX skosCore:\u0026lt;http://www.w3.org/2004/02/skos/core#\u0026gt; PREFIX dbp-ont:\u0026lt;http://dbpedia.org/ontology/\u0026gt; PREFIX xsd:\u0026lt;http://www.w3.org/2001/XMLSchema#\u0026gt; PREFIX core:\u0026lt;http://www.bbc.co.uk/ontologies/coreconcepts/\u0026gt; PREFIX curric:\u0026lt;http://www.bbc.co.uk/ontologies/curriculum/\u0026gt; PREFIX skos:\u0026lt;http://www.w3.org/2004/02/skos/core#\u0026gt; PREFIX cwork:\u0026lt;http://www.bbc.co.uk/ontologies/creativework/\u0026gt; PREFIX fb:\u0026lt;http://rdf.freebase.com/ns/\u0026gt; # Query Name : query1 # Query Description : # Retrieve creative works about thing t (or that mention t) # reasoning: rdfs:subClassOf, rdf:type # join ordering: cwork:dateModified rdf:type owl:FunctionalProperty # join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty # Choke Points : # - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified # Optimizer should use an efficient cost evaluation method for choosing the optimal join tree # - A sub-select which aggregates results. Optimizer should recognize it and execute it first # - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) # Optimizer should decide to put optional triples on top of the join tree # (i.e. delay their execution to the last possible moment) because OPTIONALs are treated as a left join # - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork # and eliminate first triple (?cwork a ?type .) 
since ?cwork is a cwork:CreativeWork​ CONSTRUCT { ?creativeWork a cwork:CreativeWork ; a ?type ; cwork:title ?title ; cwork:shortTitle ?shortTitle ; cwork:about ?about ; cwork:mentions ?mentions ; cwork:dateCreated ?created ; cwork:dateModified ?modified ; cwork:description ?description ; cwork:primaryFormat ?primaryFormat ; bbc:primaryContentOf ?webDocument . ?webDocument bbc:webDocumentType ?webDocType . ?about rdfs:label ?aboutLabel ; bbc:shortLabel ?aboutShortLabel ; bbc:preferredLabel ?aboutPreferredLabel . ?mentions rdfs:label ?mentionsLabel ; bbc:shortLabel ?mentionsShortLabel ; bbc:preferredLabel ?mentionsPreferredLabel . ?creativeWork cwork:thumbnail ?thumbnail . ?thumbnail a cwork:Thumbnail ; cwork:altText ?thumbnailAltText ; cwork:thumbnailType ?thumbnailType . } WHERE { { SELECT ?creativeWork WHERE { ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . ?creativeWork a cwork:CreativeWork ; cwork:dateModified ?modified . } ORDER BY DESC(?modified) LIMIT 10 } ?creativeWork a cwork:CreativeWork ; a ?type ; cwork:title ?title ; cwork:dateModified ?modified . OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } OPTIONAL { ?creativeWork cwork:description ?description . } OPTIONAL { ?creativeWork cwork:about ?about . OPTIONAL { ?about rdfs:label ?aboutLabel . } OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } } OPTIONAL { ?creativeWork cwork:mentions ?mentions . OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . } OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } } OPTIONAL { ?creativeWork cwork:dateCreated ?created . } OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } } OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . 
OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } } OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } } } Listing 1. Semantic Publishing Benchmark: Query 1\nThe benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.\nThe data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. The current version of the benchmark has been tested with Virtuoso and OWLIM.\nThe generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. 
The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log) (c) the benchmark results (semantic_publishing_benchmark_results.log ).\nBelow we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and deletes) and queries executed at the Nth second of a benchmark run. It also reports the total number of retrieval queries as well as the average number of queries executed per second.\nSeconds run : 600 Editorial: 0 agents 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) 0.0000 average operations per second Aggregation: 8 agents 298 Q1 queries 267 Q2 queries 243 Q3 queries 291 Q4 queries 320 Q5 queries 286 Q6 queries 255 Q7 queries 274 Q8 queries 271 Q9 queries 2505 total retrieval queries 4.1750 average queries per second Listing 2. A snippet of semantic_publishing_benchmark_results.log\nWe run the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries timeout after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. 
We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:\n#triples Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 #queries avg. #q. per sec. 10M 298 267 243 291 320 286 255 274 271 2505 41,750 100M 53 62 51 52 44 62 25 55 45 449 7,483 1B 34 29 22 24 25 29 0 29 28 220 3,667 ","permalink":"https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/","tags":["spb","sparql"],"title":"Getting Started With the Semantic Publishing Benchmark"},{"categories":null,"contents":"The Linked Data Benchmark Council (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and arbitrate around the official benchmark results. The council and its https://ldbcouncil.org website just launched, and in its first 1.5 year of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (next TUC meeting will be on October 5 in Athens) and indeed in designing benchmarks.\nSo, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by TPC have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are relevant and representative (address important challenges encountered in practice), understandable , economical (implementable on simple hardware), fair (such as not to favor a particular product or approach), scalable, accepted by the community and public (e.g. all of its software is available in open source). This list stems from Jim Gray\u0026rsquo;s Benchmark Handbook. 
In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.\nA very important aspect of benchmark development is making sure that the community accepts a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. Since in LDBC multiple commercial graph and RDF vendors are on the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on fairness had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its Technical User Community gatherings) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.\nThe need for understandability for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. 
Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the Social Network Benchmark (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.\nThe economical aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machinecode SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. 
Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.\nRepresentative benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. A rigorous example of this is the DBpedia benchmark whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).\nThe fact that a benchmark can be scaled in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. 
This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) \u0026ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.\nNow, what makes a benchmark relevant? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of \u0026ldquo;choke points\u0026rdquo;: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. 
In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute values of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). The notion observed in practice is that people who are direct colleagues, often are in each other\u0026rsquo;s friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore a suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.\nTo illustrate what choke points are in more depth, we wrote a paper in the TPCTC 2013 conference that performs a post-mortem analysis of TPC-H and identified 28 such choke points. This table lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):\nI would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper,RDF-3X), and me (MonetDB,Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can trivialize Q18; and this optimization can single-handedly improve the overall TPC-score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.\nLDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark (SPB) and the more graph-focused Social Network Benchmark (SNB), and tell us what you think. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.\n(for more posts from Peter Boncz, see also Database Architects, a blog about data management challenges and techniques written by people who design and implement database systems)\n","permalink":"https://ldbcouncil.org/post/choke-point-based-benchmark-design/","tags":["database","benchmark","design"],"title":"Choke Point Based Benchmark Design"},{"categories":null,"contents":"The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. Watch this blog for the announcements to come, as this will be a matter of weeks to add.\nThe Public Draft stage means that the initial software (data generator, query driver) work and an initial technical specification and documentation has been written. In other words, there is a testable version of the benchmark available for anyone who is interested. 
Public Draft status does not mean that the benchmarks have been adopted yet, it rather means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth Technical User Community meeting. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.\nYou can also see that we created this new website and a new logo. This website is different from http://ldbc.eu that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.\nIn the next weeks, you will see many contributors in LDBC post items on this blog. Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.\n","permalink":"https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/","tags":["developer","industry"],"title":"New Website Online LDBC Benchmarks Reach Public Draft"},{"categories":null,"contents":"Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. 
Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.\nFrom a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.\nWith the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, SNB, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.\nThe SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the LDBC Github repository. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the ACM SIGMOD programming contest 2014.\nThe SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.\nMore details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.\n","permalink":"https://ldbcouncil.org/post/social-network-benchmark-goals/","tags":["snb","datagen","interactive","bi","graphalytics"],"title":"Social Network Benchmark Goals"},{"categories":null,"contents":"It is with great pleasure that we announce the new LDBC organisation site at www.ldbcouncil.org. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the benchmarks menu on this site.\nThose benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.\nWhile the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.\nWe want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.\nFinally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.\nIn all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.\n","permalink":"https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/","tags":["ldbc"],"title":"Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies"},{"categories":null,"contents":"Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops. The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.\nMore at: http://events.sti2.at/bersys2014/\n","permalink":"https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/","tags":["workshop","cfp","benchmark","bersys"],"title":"2nd International Workshop on Benchmarking RDF Systems"},{"categories":null,"contents":"As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 [1].\nOne of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.\nFor these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:\nRealism. The data generated by DATAGEN has to mimic the features of those found in a real social network. In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. 
DATAGEN is aware of the data and link distributions found in a real social network such as Facebook [2]. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated. Scalability. Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters. Determinism. DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus, making the comparisons between different systems fair and the benchmarks’ results reproducible. Usability. LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been severely influenced by this philosophy, and therefore it has been designed to be as easy to use as possible. Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (https://github.com/ldbc/ldbc_snb_datagen).\nReferences [1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. \u0026ldquo;S3g2: A scalable structure-correlated social graph generator.\u0026rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 156-172.\n[2] Prat-Pérez, Arnau, and David Dominguez-Sal. \u0026ldquo;How community-like is the structure of synthetically generated graphs?.\u0026rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. 
ACM, 2014.\n","permalink":"https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/","tags":["datagen","social network","snb"],"title":"DATAGEN: Data Generation for the Social Network Benchmark"},{"categories":null,"contents":"In a previous blog post titled \u0026ldquo;Is SNB like Facebook\u0026rsquo;s LinkBench?\u0026rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.\nDATAGEN DATAGEN is the data generator used by all the workloads of SNB. Here we introduced the design goals that drive the development of DATAGEN, which can be summarized as: Realism, Scalability, Determinism and Usability.\nDATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.\nFor the sake of credibility, data produced by DATAGEN has to be realistic. In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:\nRealistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. 
Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as Dbpedia.\nCorrelated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical from that country, to work on companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.\nDATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.\nFinally, DATAGEN outputs two other things:\nUpdate Streams, which will be used in the future to implement updates in the workloads.\nSubstitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so the query plans of the resulting query executions do not differ significantly.\nConfiguring and using DATAGEN is easy. Please visit this page for more information.\nLDBC driver SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. 
It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.\nIt is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation page.\nThe test sponsor (aka the implementer of the benchmark), has to provide a set of implemented interfaces, that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.\nGiven a workload consisting of a series of Operations, the test sponsor implements OperationHandlers for them. OperationHandlers are responsible for executing instances of a specific operation (query) type. This is done by overriding the method executeOperation(), which receives as input parameter an Operation instance and returns the result. From the Operation instance, the operation\u0026rsquo;s input parameters can be retrieved, as well as the database connection state.\nThe database connector is used to initialize, cleanup and get the database connection state. The database connector must implement the Db interface, which consists of three methods: onInit(), onCleanup() and getConnectionState(). onInit() is called before the benchmark is executed, and is responsible for initializing the database and registering the different OperationHandlers. onCleanup() is called after the benchmark has completed. 
Any resources that need to be released should be released here.\nFinally, getConnectionState() returns an instance of DbConnectionState, which encapsulates any state that needs to be shared between OperationHandler instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.\nA good example on how to implement the benchmark can be found here.\nWorkloads Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are in the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.\nInteractive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the data base. These queries are designed to be interactive and target systems capable of responding to such queries with low latency for multiple concurrent users. Examples of Interactive queries are, given a user, retrieve those friends with a specific name, or finding the most recent post and comments created by your friends.\nBusiness Intelligence workload, will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. 
This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.\nExamples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.\nFinally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality, are representative queries of this workload, and will imply touching a vast amount of the dataset.\nFinal remarks This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification draft, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.\n","permalink":"https://ldbcouncil.org/post/getting-started-with-snb/","tags":["snb","interactive","datagen"],"title":"Getting Started With SNB"},{"categories":null,"contents":"The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. This post introduces the interactive workload.\nThe benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user\u0026rsquo;s social environment and potentially access data associated with the friends of a user and their friends.\nThis is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or E. The emphasis is presenting a rich and timely view of a constantly changing environment.\nSNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. 
These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook Linkbench but SNB Interactive goes well beyond this in richness of schema and queries.\nThe challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB interactive seeks more agility and adhoc capability also on the operational side.\nThe dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.\nThe metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.\nDifferent technologies can be used for implementing SNB interactive. 
The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database API\u0026rsquo;s.\nSNB Interactive is an example of LDBC\u0026rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.\nThe benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. Specifics of availability and coverage may vary.\nSubsequent posts will address the workload in more detail.\n","permalink":"https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/","tags":["snb","interactive"],"title":"Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload"},{"categories":null,"contents":"In this post, I will discuss in some detail the rationale and goals of the design of the Social Network Benchmark (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook\u0026rsquo;s own graph benchmark called LinkBench. We think SNB is the most intricate graph database benchmark to date (it\u0026rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference SIGMOD in Snowbird after being used for this year\u0026rsquo;s ACM SIGMOD Programming Contest, which was about graph analytics.\nSNB is intended to provide the following value to different stakeholders:\nFor end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to compare merits of different products and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.\nFor vendors of graph database technology, SNB provides a checklist of features and performance characteristics that helps in product positioning and can serve to guide new development.\nFor researchers, both industrial and academic, the SNB dataset and workload provide interesting challenges in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provides a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.\nI should clarify that even though the data model of SNB resembles Facebook (and we\u0026rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use, they don\u0026rsquo;t need LDBC for that. Rather, we take social network data as a model for the much more broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis is not just about finding data by value, but about learning about the connection patterns between data. The scenario of the SNB, a social network, was chosen with the following goals in mind:\nthe benchmark scenario should be understandable to a large audience, and this audience should also understand the relevance of managing such data.\nthe scenario in the benchmark should cover the complete range of challenges relevant for graph data management, according to the benchmark scope.\nthe query challenges in it should be realistic in the sense that, though synthetic, similar data and workloads are encountered in practice.\nThe SNB is in fact three distinct benchmarks with a common dataset, since there are three different workloads. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. 
The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.\nInteractive Workload. The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j\u0026rsquo;s Cypher, SPARQL and SQL. The interactive workload tests a system\u0026rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views, however such structures need to be maintained with regard to the continuous inserts that are also part of the workload. This workload is now in draft stage, which means that the data generator and driver software stack are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and evolve these.\nBusiness Intelligence Workload. There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. 
The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.\nGraph Analytics Workload. This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph. The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.\nAll the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art data generator. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to LinkBench, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the low-level MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. 
Note that Facebook\u0026rsquo;s IT infrastructure does not store all user data in MySQL and its modified memcached (\u0026quot;TAO\u0026quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. However, for queries like in the SNB Interactive and BI workloads it does matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with correlations, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a \u0026ldquo;choke point\u0026rdquo; for graph data management system where LDBC wants to stimulate innovation).\n","permalink":"https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/","tags":["developer","snb","interactive","bi","graphalytics"],"title":"Is SNB Like Facebooks LinkBench"},{"categories":null,"contents":"Synopsis: Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.\nIt is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. 
SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.\nAs part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. These touch, depending on the scale, upwards of a million entities in the database.\nNow, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.\nSo, LDBC SNB has a twofold task:\nIn order to be a credible interactive workload, it must in fact have characteristics of one In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics. Designing a workload presents specific challenges:\nThe workload must be realistic enough for users to identify with it. The workload must pose challenges and drive innovation in a useful direction. The component operations must all play a noticeable role in it. If the operation\u0026rsquo;s relative performance does not affect the score, why is it in the workload? The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.\nVery roughly, the choke points (technical challenges) of SNB interactive are as follows:\nRandom access - Traversing between people, content makes large numbers of random lookups. These can be variously parallelized and/or vectored. 
Query optimization must produce right plans - The primary point is join order and join type. Index vs. hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join. When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations. Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities. Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is do-able and plausible in the scenario.\nIn online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.\nA key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.\nThe other aspect is the metric, typically some variation on operations per unit of time.\nAll these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio. 
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.\nSo how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.\nBut SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.\nHaving this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to get two objectives:\nShort queries should collectively be about 45% of the CPU load. Updates will be under 5% Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others. The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.\nThere is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.\nIn the next post we will look at the actual mix and execution times on the test system.\n","permalink":"https://ldbcouncil.org/post/making-it-interactive/","tags":["snb","benchmarking","tpc","sparql","interactive"],"title":"Making It Interactive"},{"categories":null,"contents":"In previous posts (this and this) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.\nGetting and Configuring Hadoop DATAGEN runs on top of hadoop 1.2.1 to be scalable. 
You can download it from here. Open a console and type the following commands to decompress hadoop into /home/user folder:\n$ cd /home/user $ tar xvfz hadoop-1.2.1.tar.gz For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done for configuring it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster machine), visit the LDBC DATAGEN wiki.\nGetting and configuring DATAGEN Before downloading DATAGEN, be sure to fulfill the following requirements:\nLinux based machine java 1.6 or greater python 2.7.X maven 3 After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found here. Again, decompress the downloaded file with the following commands:\n$ cd /home/user $ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz This will create a folder called “ldbc_snb_datagen-0.1.2”.\nDATAGEN provides run.sh, a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.\nHADOOP_HOME=/home/user/hadoop-1.2.1 LDBC_SNB_DATAGEN_HOME=/home/user/ldbc_snb_datagen HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute run.sh script to compile and execute DATAGEN using default parameters. 
Type the following commands:\n$ cd /home/user/ldbc_snb_datagen-0.1.2 $ ./run.sh This will run DATAGEN, and two folders will be created at the same directory: social_network containing the scale factor 1 dataset with csv uncompressed files, and substitution_parameters containing the substitution parameters needed by the driver to execute the benchmark.\nChanging the generated dataset The characteristics of the dataset to be generated are specified in the params.ini file. By default, this file has the following content:\nscaleFactor:1 compressed:false serializer:csv numThreads:1 The following is the list of options and their default values supported by DATAGEN:\nOption Default value Description scaleFactor 1 \u0026ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000\u0026rdquo; serializer csv \u0026ldquo;The format of the output data. Options are: csv, csv_merge_foreign, ttl\u0026rdquo; compressed FALSE Specifies to compress the output data in gzip. outputDir ./ Specifies the folder to output the data. updateStreams FALSE \u0026ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static\u0026rdquo; numThreads 1 Sets the number of threads to use. Only works for pseudo-distributed mode For instance, a possible params.ini file could be the following:\nscaleFactor:30 serializer:ttl compressed:true updateStreams:false outputDir:/home/user/output numThreads:4 For those not interested in generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):\nOption Default value Description numPersons - The number of persons to generate numYears - The amount of years of activity startYear - The start year of simulation. 
The following is an example of another possible params.ini file\nnumPersons:100000 numYears:3 startYear:2010 serializer:csv_merge_foreign compressed:false updateStreams:true outputDir:/home/user/output numThreads:4 For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at GitHub!\n","permalink":"https://ldbcouncil.org/post/snb-data-generator-getting-started/","tags":["datagen","snb","social network"],"title":"SNB Data Generator - Getting Started"},{"categories":null,"contents":"Note: consider this post as a continuation of the \u0026ldquo;Making it interactive\u0026rdquo; post by Orri Erling.\nI have now completed the Virtuoso TPC-H work, including scale out. Optimization possibilities extend to infinity but the present level is good enough. TPC-H is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.\nSo, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.\nThe BI part is like TPC-H, except for adding the following challenges:\nJoins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.\nTransitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. 
To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.\nTransitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.\nGraph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.\nRunning one query with parameters from different buckets, implying different best plan.\nData correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.\nLarge intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.\nMore unions and outer joins.\nThe idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.\nFor rules and metric, we can use the TPC-H or TPC-DS ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and Open Street Map. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. 
This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.\nDoing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.\nAs a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or \u0026ldquo;Platonic\u0026rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.\nSo, the transitive derived table construct can have pluggable aggregations, e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.\nThe distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. 
forward and backward in betweenness centrality.\nThis brings us to the graph analytics proper, which is often done in BSP style, e.g. Pregel, Giraph, Signal-Collect, some but not all Green-Marl applications. In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.\nWith BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as graphbench.org but will be adapted to the SNB dataset.\nThe analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. The interested may pitch in as the matter comes up.\n","permalink":"https://ldbcouncil.org/post/the-day-of-graph-analytics/","tags":["analytics","snb"],"title":"The Day of Graph Analytics"},{"categories":null,"contents":"During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC Semantic Publishing Benchmark (LDBC-SPB) as a part of our development and release process.\nFirst thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.\nInitially we’ve decided to fix some of the benchmark parameters :\nthe dataset size - 50 million triples (LDBC-SPB50) * benchmark warmup and benchmark run times - 60s and 600s respectively. 
* maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations) * maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations) * generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run. Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.\nWe’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms for editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.\nFollowing two charts are showing how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on a bare-metal hardware.\nFigure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. 
even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.\nFigure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation running simultaneously. Read-only mode.\nAnother thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark\u0026rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners can not be compared to backward-chaining ones which is not the goal of the benchmark. Loading performance is not measured “officially“ by LDBC-SPB (although time for loading the data is reported), but it is a good thing to have when comparing RDF Stores.\nAn additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called checkConformance. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. 
The result of that phase is a list of all rules that have been passed or failed which is very useful for regression testing.\n","permalink":"https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/","tags":["ldbc","spb","rdf"],"title":"Using LDBC SPB to Find OWLIM Performance Issues"},{"categories":null,"contents":"The LDBC consortium are pleased to announce the fourth Technical User Community (TUC) meeting.\nThis will be a one-day event at CWI in Amsterdam on Thursday April 3, 2014.\nThe event will include:\nIntroduction to the objectives and progress of the LDBC project. Description of the progress of the benchmarks being evolved through Task Forces. Users explaining their use-cases and describing the limitations they have found in current technology. Industry discussions on the contents of the benchmarks. All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu\nFor presenters please limit your talks to just 15 minutes\nAgenda April 3rd\n10:00 Peter Boncz (VUA) – pptx, video: LDBC project status update\n10:20 Norbert Martinez (UPC) – pdf, video: Status update on the LDBC Social Network Benchmark (SNB) task force.\n10:50 Alexandru Iosup (TU Delft) – ppt, video: Towards Benchmarking Graph-Processing Platforms\n11:10 Mike Bryant (Kings College) – pptx, video: EHRI Project: Archival Integration with Neo4j\n11:30 coffee\n11:50 Thilo Muth (University of Magdeburg) – pptx, video: MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis\n12:10 Davy Suvee (Janssen Pharmaceutica / Johnson \u0026amp; Johnson) – video: Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph\n12:30 Yongming Luo (TU Eindhoven) – pdf, video: Regularities and dynamics in bisimulation reductions of big graphs\n12:50 Christopher Davis (TU Delft) – pdf, video: Enipedia - Enipedia is an active exploration into 
the applications of wikis and the semantic web for energy and industry issues\n13:10 - 14:30 lunch @ restaurant Polder\n14:30 SPB task force report\n15:00 Bastiaan Bijl (Sysunite) – pdf, video: Using a semantic approach for monitoring applications in large engineering projects\n15:20 Frans Knibbe (Geodan) – pptx, video: Benchmarks for geographical data\n15:40 Armando Stellato (University of Rome, Tor Vergata \u0026amp; UN Food and Agriculture Organization) – pptx, video: VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges\n16:00 coffee\n16:20 Ralph Hodgson (TopQuadrant) – [pdf](https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf), video: Customer experiences in implementing SKOS-based vocabulary management systems\n16:40 Simon Jupp (European Bioinformatics Institute) – pdf, video: [Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.]\n17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – pdf, video: Breakmarking UniProt RDF. SPARQL queries that make your database cry\u0026hellip;\n17:20 Rein van \u0026rsquo;t Veer (Digital Heritage Netherlands) – pptx, video Time and space for heritage\n17:40 end of meeting\n19:00 - 21:30 Social Dinner in restaurant Boom\nApril 4th\nLDBC plenary meeting for project partners.\nBenchmarking Graph-Processing Platforms: A Vision – Alexandru Iosup Logistics The meeting will be held at the Dutch national research institute for computer science and mathematics (CWI - Centrum voor Wiskunde en Informatica). 
It is located at Amsterdam Science Park:\n(A5 map)\nTravel Arriving \u0026amp; departing:\nAmsterdam has a well-functioning and nearby airport called Schiphol (AMS, www.schiphol.nl) that serves all main European carriers and also very many low-fare carriers.\nhttp://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane\nTrains (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) \u0026ndash; which station you are also likely arriving at in case of an international train trip.\nFrom the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 \u0026ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).\nTaxi is an alternative, though expensive. The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).\nPublic transportation (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.\nOnly the \u0026ldquo;disposable\u0026rdquo; cards are interesting for you as visitor.\nMulti-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.\nGetting Around: the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. 
For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.\nCars\nIn case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the \u0026ldquo;WCW\u0026rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.\nArriving at CWI: Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. Then, you\u0026rsquo;ll receive a visitor\u0026rsquo;s pass that allows you to enter our building.\nSocial Dinner\nThe social dinner will take place at 7pm on April 3 in Restaurant Boom (boometenendrinken.nl), Linneausstraat 63, Amsterdam.\n","permalink":"https://ldbcouncil.org/event/fourth-tuc-meeting/","tags":["TUC Meeting"],"title":"Fourth TUC meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalsercim-angles-pb-14/","tags":[],"title":"Benchmarking Linked Open Data Management Systems"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-bookscrclinked-14-boncz-ep-14/","tags":[],"title":"Experiences with Virtuoso Cluster RDF Column Store"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confedbt-gubichev-014/","tags":[],"title":"Exploiting the query structure for efficient join ordering in SPARQL queries"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-gubichev-t-14/","tags":[],"title":"Graph Pattern Matching - Do We Have to Reinvent the 
Wheel?"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confwww-prat-perez-dl-14/","tags":[],"title":"High quality, scalable and parallel community detection for large real graphs"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-prat-d-14/","tags":[],"title":"How community-like is the structure of synthetically generated graphs?"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conficde-gubichev-ab-14/","tags":[],"title":"How to generate query parameters in RDF benchmarks?"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confrweb-larriba-pey-md-14/","tags":[],"title":"Introduction to Graph Databases"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/ldbc-spc-specification/","tags":[],"title":"LDBC Semantic Publishing Benchmark (SPB) - v2.0"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalsercim-pham-b-14/","tags":[],"title":"MonetDB/RDF: Discovering and Exploiting the Emergent Schema of RDF Data"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conftpctc-gubichev-b-14/","tags":[],"title":"Parameter Curation for Benchmark Queries"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalssigmod-angles-blf-0-enmkt-14/","tags":[],"title":"The Linked Data Benchmark Council: A graph and RDF industry benchmarking effort"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-martinez-bazan-d-14/","tags":[],"title":"Using semijoin programs to solve traversal queries in graph databases"},{"categories":null,"contents":"The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!\nThis will be a one day event in London on the 19 November 2013 running in collaboration with the GraphConnect event (18/19 
November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using this following coupon code: LDBCTUC.\nThe TUC event will include:\nIntroduction to the objectives and progress of the LDBC project Description of the progress of the benchmarks being evolved through Task Forces Users explaining their use-cases and describing the limitations they have found in current technology Industry discussions on the contents of the benchmarks We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.\nWe will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.\nAll users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu\nAgenda Logistics LDBC/TUC Background Social Network Benchmark Semantic Publishing Benchmark Agenda November 19th - Public TUC Meeting\n8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)\nshort LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)\nNOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.\n10:00 TUC Meeting Opening (Peter Boncz)\n10:10 TUC Presentations (RDF Application Descriptions)\nJohan Hjerling (BBC): BBC Linked Data and the Semantic Publishing Benchmark Andreas Both (Unister): Ontology-driven applications in an e-commerce context Nuno Carvalho (Fujitsu Laboratories Europe): Fujitsu RDF use cases and benchmarking requirements Robina Clayphan (Europeana): Europeana and Open Data 11:30 Semantic Publishing Benchmark (SPB)\nVenelin Kotsev (Ontotext - LDBC): Semantic Publishing Benchmark Task Force Update and report 12:00-13:00 Lunch at the Graph Connect venue\nTalks During Lunch:\nPedro Furtado, Jorge Bernardino (Univ. 
Coimbra): KEYSTONE Cost Action 13:00 TUC Presentations (Graph Application Descriptions)\nMinqi Zhou / Weining Qian (East China Normal University): Elastic and realistic social media data generation Andrew Sherlock (Shapespace): Shapespace Use Case Sebastian Verheughe (Telenor): Real-time Resource Authorization 14:00 Social Network Benchmark (SNB)\nNorbert Martinez (UPC - LDBC): Social Network Benchmark Task Force Update and Report 14:30 Break\n14:45 TUC Presentations (Graph Analytics)\nKeith Houck (IBM): Benchmarking experiences with [System G Native Store (tentative title)] Abraham Bernstein (University of Zurich): Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store Luis Ceze (University of Washington): Grappa and GraphBench Status Update 15:45 Break\n16:00 TUC Presentations* (Possible Future RDF Benchmarking Topics)*\nChristian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): CIDOC-CRM Atanas Kiryakov (Ontotext): Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM) Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): Geographica: A Benchmark for Geospatial RDF Stores Xavier Lopez (Oracle): W3C Property Graph progress Thomas Scharrenbach (University Zurich) PCKS: Benchmarking Semantic Flow Processing Systems 17:20 Meeting Conclusion (Josep Larriba Pey)\n17:30 End of TUC meeting\n19:00 Social dinner\nNovember 20th - Internal LDBC Meeting\n10:00 Start\n12:30 End of meeting\ncoffee and lunch provided Logistics Date\n19th November 2013\nLocation\nThe TUC meeting will be held in The Tower hotel (Google Maps link) approximately 4 minutes walk from the GraphConnect conference in London.\nGetting there\nFrom City Airport is the easiest: short ride on the DLR to Tower Gateway. Easy. From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. See attached. 
LDBC/TUC Background Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:\nLDBC_SNB_Report_Nov2013.pdf LDBC_SPB_Report_Nov2013.pdf A summary of these efforts can be read below or, for a more detailed account, please refer to: The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.\nSocial Network Benchmark The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. The social network scenario was chosen with the following goals in mind:\nit should be understandable, and the relevance of managing such data should be understandable it should cover the complete range of interesting challenges, according to the benchmark scope the queries should be realistic, i.e., similar data and workloads are encountered in practice SNB includes a data generator for creation of synthetic social network data with the following characteristics:\ndata schema is representative of real social networks data generated includes properties occurring in real data, e.g. 
irregular structure, structure/value correlations, power-law distributions the software generator is easy-to-use, configurable and scalable SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:\nInteractive Tests system throughput with relatively simple queries and concurrent updates, it is designed to test ACID features and scalability in an online operational setting. The targeted systems are expected to be those that offer transactional functionality. Business Intelligence Consists of complex structured queries for analyzing online behavior of users for marketing purposes, it is designed to stress query execution and optimization. The targeted systems are expected to be those that offer an abstract query language. Graph Analytics Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language. Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need isolation. The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures. Semantic Publishing Benchmark The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.\nThe scenario is a media organization that maintains RDF descriptions of its catalogue of creative works \u0026ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. 
This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.\nThe data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.\nTwo separate workloads are modeled in SPB:\nEditorial: Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies. Aggregation: Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as \u0026ldquo;dynamic\u0026rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page. 
Status of the Semantic Publishing Benchmark\n","permalink":"https://ldbcouncil.org/event/third-tuc-meeting/","tags":["TUC Meeting"],"title":"Third TUC Meeting"},{"categories":null,"contents":"The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.\nThis will be a two day event in Munich on the 22/23rd April 2013.\nThe event will include:\nIntroduction to the objectives and progress of the LDBC project. Description of the progress of the benchmarks being evolved through Task Forces. Users explaining their use-cases and describing the limitations they have found in current technology. Industry discussions on the contents of the benchmarks. All users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu\nAgenda Logistics Date Location Venue Getting to the TUM Campus from the Munich city center: Subway (U-Bahn) Getting to the TUM Campus from the Munich Airport Getting to the TUM Campus from Garching: U-Bahn Getting there Social Dinner Agenda April 22nd\n10:00 Registration.\n10:30 Josep Lluis Larriba Pey (UPC) - Welcome and Introduction.\n10:30 Peter Boncz (VUA): LDBC: goals and status\nSocial Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)\n11:00 Josep Lluis Larriba Pey (UPC): Social Network Benchmark Task Force\n11:30 Gustavo González (Mediapro): Graph-based User Modeling through Real-time Social Streams\n12:00 Klaus Großmann (Dshini): Neo4j at Dshini\n12:30 Lunch\nSemantic Publishing Use Cases (with discussion moderated by Barry Bishop)\n13:30 Barry Bishop (Ontotext): Semantic Publishing Benchmark Task Force\n14:00 Dave Rogers (BBC): Linked Data Platform at the BBC\n14:30 Edward Thomas (Wolters Kluwer): Semantic Publishing at Wolters Kluwer\n15:00 Coffee break\nProjects Related to LDBC\n15:30 Fabian Suchanek (MPI): \u0026ldquo;YAGO: A large knowledge base from Wikipedia and WordNet\u0026rdquo;\n16:00 Antonis Loziou (VUA): The OpenPHACTS 
approach to data integration\n16:30 Mirko Kämpf (Brox): \u0026ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case\u0026rdquo;\n17:00 End of first day\n19:00 Social dinner\nApril 23rd\nIndustry \u0026amp; Hardware Aspects\n10:00 Xavier Lopez (Oracle): Graph Database Performance an Oracle Perspective.pdf\n10:30 Pedro Trancoso (University of Cyprus): \u0026ldquo;Benchmarking and computer architecture: the research side\u0026rdquo;\n11:00 Coffee break\nFuture Steps and TUC feedback session\n11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force\n12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force\n12:30 End of meeting\nLogistics Date 22nd and 23rd April 2013\nLocation The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:\nLRZ (Leibniz-Rechenzentrum)\nBoltzmannstraße 1\n85748 Garching, Germany\nVenue To reach the campus, there are several options, including Taxi and Subway Ubahn\nGetting to the TUM Campus from the Munich city center: Subway (U-Bahn) Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.\nGetting to the TUM Campus from the Munich Airport (except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. 
Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.\nS-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.\nTaxi: fare is ca. 30-40 euros.\nFor cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. It will cover all public transportation for that day.\nGetting to the TUM Campus from Garching: U-Bahn The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.\nFinding LRZ@TUM\nOpenStreetMap link\nGoogle Maps link\nGetting there Flying: Munich airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.\nS-Bahn: S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the tickets needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.\nTaxi: taxi from the airport to the city center costs approximately 50 euros\nSocial Dinner The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)\nAddress: Hofbräuhaus, Platzl 9, Munich\n","permalink":"https://ldbcouncil.org/event/second-tuc-meeting/","tags":["TUC Meeting"],"title":"Second TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confamw-angles-br-13/","tags":[],"title":"A Practical Query Language for Graph DBs"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-angles-pdl-13/","tags":[],"title":"Benchmarking database systems for social network applications"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-ma-wqyxz-13/","tags":[],"title":"On benchmarking online social media analytical queries"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalscorrabs-1301-5121/","tags":[],"title":"Partitioning Graph Databases - A Quantitative Evaluation"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conficde-pham-13/","tags":[],"title":"Self-organizing structured RDF in MonetDB"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-gubichev-bs-13/","tags":[],"title":"Sparqling Kleene: fast property paths in RDF-3X"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalsdbsk-boncz-fgl-013/","tags":[],"title":"The Linked Data Benchmark Council Project"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsigmod-cattuto-qpa-13/","tags":[],"title":"Time-varying social networks in a graph database: a Neo4j use case"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conftpctc-boncz-ne-13/","tags":[],"title":"TPC-H Analyzed: Hidden Messages and Lessons Learned from an 
Influential Benchmark"},{"categories":null,"contents":"The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the 19/20th November 2012.\nSo far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event will include:\nIntroduction by the coordinator and technical director explaining the objectives of the LDBC project Invitation to users to explain their use-cases and describe the limitations they have found in current technology Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points The exact agenda will be published here as things get finalised before the event.\nAll users of RDF and graph databases are welcome to attend. If you are interested, please contact: ldbc AT ac DOT upc DOT edu\nAgenda Slide Logistics Date Location Venue Getting there Agenda We will start at 9:00 on Monday for a full day, followed by half a day on Tuesday to allow attendees to travel home on the evening of the 20th.\nDay 1\n09:00 Welcome (Location: Aula Master)\n09:30 Project overview (Emphasis on task forces?) 
+ Questionnaire results?\n10:30 Coffee break\n11:00 User talks (To gather information for use cases?)\n13:00 Lunch\n14:00 User talks (cont.)\n15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).\n16:00 Task force proposals (consortium)\n17:00 Finish first day\n20:00 Social dinner\nDay 2\n10:00 Task force discussion (consortium + TUC)\n11:00 Coffee break\n11:30 Task force discussion (consortium + TUC)\n12:30 Summaries (Task forces, use cases, \u0026hellip;) and actions\n13:00 Lunch and farewell\n15:00 LDBC Internal meeting\nSlide Opening session:\nCWI – Peter Boncz – Objectives UPC – Larri – Questionnaire User stories:\nBBC – Jem Rayfield CA Technologies – Victor Muntés Connected Discovery (Open Phacts) – Bryn Williams-Jones Elsevier – Alan Yagoda ERA7 Bioinformatics – Eduardo Pareja Press Association – Jarred McGinnis RJLee – David Neuer Yale – Lec Maj Benchmark proposals:\nPublishing benchmark proposal – Ontotext – Barry Bishop Social Network Benchmark Proposal – UPC – Larri Logistics Date 19th and 20th November 2012\nLocation The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:\nAula Master\nEdifici A3, Campus Nord UPC\nC. Jordi Girona, 1-3\n08034 Barcelona, Spain\nVenue To reach the campus, there are several options, including Taxi, Metro and Bus.\nFinding UPC\nFinding the meeting room\nGetting there Flying: Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this map of the airport). It is possible to buy 10 packs of train tickets which makes it cheaper. 
Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.\nRail: The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.\nBus: The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.\nTaxi: From the airport, you can take one of Barcelona\u0026rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.\nTrain and bus: Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. 
You can find detailed information in the following link: http://www.barcelona-airport.com/eng/transport_eng.htm\nThe locations of the airport and the city centre\nBus map\n","permalink":"https://ldbcouncil.org/event/first-tuc-meeting/","tags":["TUC Meeting"],"title":"First TUC Meeting"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confcikm-gubichev-n-12/","tags":[],"title":"Fast approximation of Steiner trees in large graphs"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confedbt-tsialiamanis-sfcb-12/","tags":[],"title":"Heuristics-based query optimisation for SPARQL"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-confsemweb-phuoc-dpbef-12/","tags":[],"title":"Linked Stream Data Processing Engines: Facts and Figures"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-conftpctc-pham-be-12/","tags":[],"title":"S3G2: A Scalable Structure-Correlated Social Graph Generator"},{"categories":[],"contents":"","permalink":"https://ldbcouncil.org/publication/dblp-journalsdebu-erling-12/","tags":[],"title":"Virtuoso, a Hybrid RDBMS/Graph Column Store"},{"categories":null,"contents":"Redirecting\u0026hellip;\n","permalink":"https://ldbcouncil.org/events/","tags":null,"title":""},{"categories":null,"contents":"","permalink":"https://ldbcouncil.org/pages/page-name/","tags":null,"title":""},{"categories":null,"contents":"Redirecting\u0026hellip;\n","permalink":"https://ldbcouncil.org/posts/","tags":null,"title":""},{"categories":null,"contents":"Benefits The benefits of LDBC membership are:\nAccess to the internal LDBC information via its wiki and mailing lists, which includes access to discussion documents, graph user scenario\u0026rsquo;s, datasets, draft benchmark specifications and software, results and discussions. Right to membership of LDBC task forces. 
Currently, there are task forces on the Semantic Publishing Benchmark, the Social Network Benchmark Interactive, BI and the Graphalytics workloads, as well as the Graph Query Language task force. Access to research resources at academic partners. This includes the ability to look into research agenda\u0026rsquo;s provide feedback and establish working relationships with students, as well as arrange targeted internships with MSc and PhD students provided by these partners. Annual membership fees (2023) Sponsor company/institution: 8,800 GBP Commercial company: 2,200 GBP Non-commercial institution: 1,100 GBP Individual Voting member (standard rate): 165 GBP Individual Voting member (reduced rate for students, etc.): 10 GBP Individual Associate member: no subscription fee Note that there is a 2,000 GBP auditing fee per audit for the LDBC for non-sponsor company members. Sponsor companies are exempt from this.\nForms For the latest information on becoming an LDBC member, see the LDBC Membership – Joining and Renewing 2023 document.\nPlease fill out the form applicable to your employment status and contribution and email it to the info@ldbcouncil.org address.\nMembership form (both for individuals and organizations) Individual contributor license agreement (CLA) Organization contributor license agreement (CLA) Constitutional documents See the constitutional documents page.\n","permalink":"https://ldbcouncil.org/becoming-a-member/","tags":null,"title":"Becoming a Member"},{"categories":null,"contents":"This page contains LDBC\u0026rsquo;s constitutional documents: its Articles of Association and Byelaws.\nCurrent versions Articles of Association (2023-03-30) Byelaws (v1.4, 2023-05-02) Old versions Articles of Association (2021-01-14) Byelaws (v1.3, 2021-01-14) Byelaws (v1.2, 2020-07-28) Byelaws (v1.1, 2017-03-02) ","permalink":"https://ldbcouncil.org/constitutional-documents/","tags":null,"title":"Constitutional Documents"},{"categories":null,"contents":"If you are 
interested in joining our benchmark task forces, please reach out at info@ldbcouncil.org.\nLicensing Our benchmarks are licensed under the Apache Software License, Version 2.0 (license file, notice file).\nContributor License Agreement To contribute to the LDBC repositories, we ask you to sign a CLA or become an LDBC member. These options are available for both individuals and organizations.\n","permalink":"https://ldbcouncil.org/developer-community/","tags":null,"title":"Developer Community"},{"categories":null,"contents":" Group leader: Petra Selmer (Neo4j) Focus: Surveying existing graph query languages Group members Angela Bonifati (Lyon 1 University) Frank Celler (ArangoDB) Victor Lee (TigerGraph) Harsh Thakkar (Consultant OSTHUS GmBH) Jeffrey Lovitz (RedisGraph) Renzo Angles (Universidad de Talca) ","permalink":"https://ldbcouncil.org/gql-community/elwg/","tags":null,"title":"Existing Languages Working Group (ELWG)"},{"categories":null,"contents":" The text of this page is based on our Byelaws.\nLDBC Benchmarks® and LDBC Benchmark® Results LDBC expects all its members to conscientiously observe the provisions of this Fair Use Policy for LDBC Benchmarks. LDBC-approved auditors must bring this Fair Use Policy for LDBC Benchmarks to the attention of any prospective or actual Test Sponsor. The Board of Directors of LDBC is responsible for enforcing this Policy and any alleged violations should be notified to info@ldbcouncil.org.\nAn “LDBC Draft Benchmark®” is a benchmark specification and any associated tooling or datasets, which has been written by an LDBC Task Force or Working Group whose charter includes the goal of achieving adoption of that specification as an LDBC standard, in accordance with Article 33 of the Articles of Association of the Company, “Approval of Standards”. An “LDBC Benchmark®” is an LDBC Draft Benchmark once it has been adopted as an LDBC standard. 
A result of a performance test can be fairly described as an “LDBC Benchmark Result”, if the test\u0026mdash;which may be executed in several runs all of which use the same System Under Test (SUT)\u0026mdash;has been successfully audited by an LDBC-approved auditor, and the result is reported as part of an LDBC Benchmark Results set, so it can be interpreted in context. An audit can only be successful if the audited test uses a SUT which faithfully implements the mandatory features and chosen optional features of an LDBC Benchmark , completely exercises and generates results for all the mandatory requirements and chosen optional requirements of the LDBC Benchmark, and is conducted and audited in conformance with all the relevant provisions of the LDBC Byelaws, including the statement of Total Cost of Ownership for the SUT and the reporting of price/performance metrics, such that the reported results can legitimately be used to compare the price-weighted performance of two SUTs. “LDBC Benchmark Results” is a set of all the results of a successfully audited test. A single LDBC Benchmark Result must be reported as part of such a set. Any description or depiction of a specification that states or implies that it is an LDBC Draft Benchmark or an LDBC Benchmark when that is not the case is an infringement of LDBC’s trademark in the term “LDBC BENCHMARK”, which is registered in several major jurisdictions. The same trademark is infringed by any software which is described or promoted as being an implementation of an LDBC Draft Benchmark or LDBC Benchmark, but which does not faithfully implement the features of or does not support the mandatory requirements of the stated specification. The same trademark is infringed by any report or description of one or more performance test results which are not part of set of LDBC Benchmark Results, or in any other way states or implies that the results are endorsed by or originates from LDBC. 
LDBC considers the use of that trademarked term with respect to performance test results solely in accordance with these Byelaws to be essential to the purpose and reputation of the Company and its benchmark standards. Reporting of LDBC Benchmark Results Once an auditor has approved a performance test result, including all required supporting documentation, as being successfully audited, then the Members Council and the Task Force responsible for the benchmark will be notified. The Board will have the results added to the LDBC web site as an LDBC Benchmark Results set according to the following procedure:\nLDBC members will receive notification of the result via email to their designated contacts within five business days of LDBC receiving the notification. Within five business days of this notice, the LDBC administrator will post the result on the LDBC web site under the rubric \u0026ldquo;LDBC Benchmark Results” unless the result is withdrawn by the Test Sponsor in the meantime. A result may be challenged and subsequently be withdrawn by the LDBC following a review process as described in Article 7.6. A result that is not challenged within 60 days of its publication will be automatically considered valid and may not be challenged after this time, and this fact will be recorded as part of the website posting of the result. Fair Use of the trademark LDBC BENCHMARK Any party wishing to avoid infringement of the trademarked term “LDBC BENCHMARK” should follow the following guidelines relating to its fair use.\nLDBC encourages use, derived use, study, descriptions, critiques of and suggestions for improvement of LDBC Draft Benchmarks and LDBC Benchmarks. Our benchmark specifications are open-source, and we always welcome new contributors and members. 
These guidelines are only intended to prevent false or confusing claims relating to performance test results that are intended to be used for product comparisons.\nIf your work is derived from an LDBC Draft or standard Benchmark, or is a partial implementation, or if you are using part of one of our standards for a non-benchmarking purpose, then we would expect you to give attribution, in line with our Creative Commons CC-BY 4.0 licence. We would also suggest that you make a statement, somewhere, somehow, that includes one of these phrases “This is not an LDBC Benchmark”, “This is not an implementation of an LDBC Benchmark” or “These are not LDBC Benchmark Results”. We would also suggest that you explain, however briefly, how your work is related to LDBC standards and how it varies from them. An example that illustrates these points: you might say something like this in a presentation:\n“We used the LDBC SNB benchmark as a starting point. This isn’t the official LDBC standard: we added four queries because of X, and we don’t attempt to deal with the ACID requirement. The test results aren’t audited, so we want to be clear that this is not an LDBC Benchmark test run, and these numbers are not LDBC Benchmark Results. If you look at this link on the slide I’m showing you can see all the details of how our work is derived from, and varies from, the SNB 2.0 spec.”\nOr you might say:\n“For this example of a GQL graph type we used the LDBC SNB data model. This is nothing to do with the actual LDBC benchmark specification: we just used their data model as a use-case for illustrating what a graph schema might look like. 
We took this from the SNB 2.0 spec.”\n","permalink":"https://ldbcouncil.org/benchmarks/fair-use-policies/","tags":null,"title":"Fair Use Policy for LDBC Benchmarks®"},{"categories":null,"contents":" Group leaders: Leonid Libkin (ENS Paris, University of Edinburgh), Paolo Guagliardo (University of Edinburgh) Focus: Establishing formal semantics for the upcoming GQL language ","permalink":"https://ldbcouncil.org/gql-community/fswg/","tags":null,"title":"Formal Semantics Working Group (FSWG)"},{"categories":null,"contents":"The Linked Data Benchmark Council (LDBC) is a non-profit organization aiming to define standard graph benchmarks to foster a community around graph processing technologies. LDBC consists of members from both industry and academia, including organizations and individuals.\nAn overview of our activities is given in a lightning talk at FOSDEM 2023\u0026rsquo;s HPC room (9 minutes):\nSee also our TPCTC 2023 paper and its slide deck.\nContact To learn more about LDBC, reach out at info@ldbcouncil.org.\nPost address First Floor, Two Chamberlain Square\nBirmingham\nB3 3AX\nUnited Kingdom\n","permalink":"https://ldbcouncil.org/introduction/","tags":null,"title":"Introduction"},{"categories":null,"contents":"Group leader: Alastair Green (JCC)\nActive members:\nKoji Annoura Michael Behrisch Stephen Cannan Alin Deutsch George Fletcher Thomas Friesendal Denise Gosnell Alastair Green Cole Greer Zhihui Guo Keith Hare Jan Hidders Longbin Lai Heng Lin Alessandro Mosca Stefan Plantikow Yuya Sasaki Ognjen Savkovic Michael Schmidt Dominik Tomaszuk Yang Xia Wenyuan Yu Tao Wang Dušan Živkovic and 15+ observers See the LEX work charter which details the group\u0026rsquo;s mission, motivation, and scope of work.\n","permalink":"https://ldbcouncil.org/gql-community/lex/","tags":null,"title":"LDBC Extended GQL Schema (LEX) working group"},{"categories":null,"contents":"The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial 
scenarios such as\nanti-fraud and risk control. It is maintained by the LDBC FinBench Task Force.\nThe benchmark has one workload, Transaction Workload, capturing an OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph. Its data sets are available on Google Drive.\nFor a brief overview, see the slides presented in the 16th TUC meeting. The Financial Benchmark\u0026rsquo;s specification can be found on arXiv.\nFinBench Audit Audit results There are no audited results yet.\nCommissioning audits For auditing requests, please reach out at info@ldbcouncil.org. Audits can only be commissioned by LDBC member companies by contracting any of the LDBC-certified auditors. Note that there is a 2,000 GBP auditing fee to be paid to the LDBC by non-sponsor company members. Sponsor companies are exempt from this.\nUse of audited results Fair use policies The LDBC Financial Benchmark is subject to the LDBC Fair Use Policies.\n","permalink":"https://ldbcouncil.org/benchmarks/finbench/","tags":null,"title":"LDBC Financial Benchmark (LDBC FinBench)"},{"categories":null,"contents":"LDBC\u0026rsquo;s working groups investigate research questions on graph query languages and language extensions for graphs including ISO/IEC SQL/PGQ, released in June 2023, and GQL, scheduled to be released in March 2024.\nActive Working Groups LDBC Extended GQL Schema (LEX) working group Historical Working Groups Property Graph Schema Working Group Existing Languages Working Group Formal Semantics Working Group ","permalink":"https://ldbcouncil.org/gql-community/overview/","tags":null,"title":"LDBC Graph Query Working Groups"},{"categories":null,"contents":"The Graphalytics benchmark is an industrial-grade benchmark for graph analysis platforms such as Giraph, Spark GraphX, and GraphBLAS. 
It consists of six core algorithms, standard data sets, and reference outputs, enabling the objective comparison of graph analysis platforms.\nThe benchmark harness consists of a core component, which is extendable by a driver for each different platform implementation. The benchmark includes the following algorithms:\nbreadth-first search (BFS) PageRank (PR) weakly connected components (WCC) community detection using label propagation (CDLP) local clustering coefficient (LCC) single-source shortest paths (SSSP) The choice of these algorithms was carefully motivated, using the LDBC TUC and extensive literature surveys to ensure good coverage of scenarios. The standard data sets include both real and synthetic data sets, which are classified into intuitive “T-shirt” sizes (S, M, L, etc.).\nEach experiment set in Graphalytics consists of multiple platform runs (a platform executes an algorithm on a data set), and a diverse set of experiments is carried out to evaluate different performance characteristics of a system-under-test.\nAll completed benchmarks must go through a strict validation process to ensure the integrity of the performance results.\nThe development of Graphalytics is supported by many active vendors in the field of large-scale graph analytics. Currently, Graphalytics already facilitates benchmarks for a large number of graph analytics platforms, such as GraphBLAS, Giraph, GraphX, and PGX.D, allowing comparison of the state-of-the-art system performance of both community-driven and industrial-driven platforms. To get started, the details of the Graphalytics documentation and its software components are described below.\nDocuments and repositories Benchmark specification. 
The source code is stored in the ldbc_graphalytics_docs repository VLDB paper ldbc_graphalytics: Generic driver ldbc_graphalytics_platforms_umbra: Umbra implementation ldbc_graphalytics_platforms_graphblas: GraphBLAS implementation Graphalytics competition 2023 In 2023, we will hold a new round of the Graphalytics competition. See the LDBC Graphalytics Benchmark presentation for an introduction to the benchmark framework and the competition\u0026rsquo;s rules.\nArtifacts:\nbenchmark framework reference implementations data sets (data sets and expected results) are available on GitHub Rules Participation is free. There are no monetary prizes. Single-node and distributed implementations are allowed. Partial implementations (e.g. just small to mid-sized data sets and only a few algorithms) are allowed. Submissions should execute each algorithm-data set combination three times. From these, the arithmetic mean of the processing times is used for ranking. The results of the competition will be published on the LDBC website in the form of leaderboards, which rank submissions based on performance and price-performance (adjusted for the system price). There is a global leaderboard that includes all algorithms and scale factors. Additionally, there is a separate leaderboard for each scale (S, M, L, XL, 2XL+), algorithm and system category (CPU-based/GPU-based, single-node vs. distributed) to allow for fine-grained comparison. Submissions are subject to code review and reproducibility attempts from the organizers. System prices should be reported following the TPC Pricing specification. Recommendations for submissions Submissions using modern hardware are welcome (GPUs, FPGAs, etc.). We encourage the use of cloud compute instances for running the benchmark (if possible). Important dates March 17: Competition is announced April 25: Confirmation of intent May 1: Submissions open June 25: Submissions close Data sets The Graphalytics data sets are compressed using zstd. 
The total size of the compressed archives is approx. 350GB. When decompressed, the data sets require approximately 1.5TB of disk space.\nFor detailed information on the data sets, see the table with their statistics.\nThe data sets are available in two locations:\nA public Cloudflare R2 bucket This is the primary source for the data sets and is kept up-to-date upon changes The links in the table below point to this bucket Shell script to download the data sets from Cloudflare R2 Download scripts for individual sizes: test graphs, sizes up to S, size M, size L, size XL, sizes 2XL+ CWI/SURFsara data repository Backup repository Shell script to download the data sets from SURFsara Note that some of the Graphalytics data sets were fixed in March 2023. Prior to this, they were incorrectly packaged or had missing/incorrect reference outputs for certain algorithms. If you are uncertain whether you have the correct versions, cross-check them against these MD5 checksums: datagen-9_4-fb, datagen-sf3k-fb, datagen-sf10k-fb, graph500-27, graph500-28, graph500-29, graph500-30.\ndata set #nodes #edges scale link size cit-Patents 3,774,768 16,518,947 XS cit-Patents.tar.zst 119.1 MB com-friendster 65,608,366 1,806,067,135 XL com-friendster.tar.zst 6.7 GB datagen-7_5-fb 633,432 34,185,747 S datagen-7_5-fb.tar.zst 162.3 MB datagen-7_6-fb 754,147 42,162,988 S datagen-7_6-fb.tar.zst 200.0 MB datagen-7_7-zf 13,180,508 32,791,267 S datagen-7_7-zf.tar.zst 434.5 MB datagen-7_8-zf 16,521,886 41,025,255 S datagen-7_8-zf.tar.zst 544.3 MB datagen-7_9-fb 1,387,587 85,670,523 S datagen-7_9-fb.tar.zst 401.2 MB datagen-8_0-fb 1,706,561 107,507,376 M datagen-8_0-fb.tar.zst 502.5 MB datagen-8_1-fb 2,072,117 134,267,822 M datagen-8_1-fb.tar.zst 625.4 MB datagen-8_2-zf 43,734,497 106,440,188 M datagen-8_2-zf.tar.zst 1.4 GB datagen-8_3-zf 53,525,014 130,579,909 M datagen-8_3-zf.tar.zst 1.7 GB datagen-8_4-fb 3,809,084 269,479,177 M datagen-8_4-fb.tar.zst 1.2 GB datagen-8_5-fb 4,599,739 332,026,902 L 
datagen-8_5-fb.tar.zst 1.5 GB datagen-8_6-fb 5,667,674 421,988,619 L datagen-8_6-fb.tar.zst 1.9 GB datagen-8_7-zf 145,050,709 340,157,363 L datagen-8_7-zf.tar.zst 4.6 GB datagen-8_8-zf 168,308,893 413,354,288 L datagen-8_8-zf.tar.zst 5.3 GB datagen-8_9-fb 10,572,901 848,681,908 L datagen-8_9-fb.tar.zst 3.7 GB datagen-9_0-fb 12,857,671 1,049,527,225 XL datagen-9_0-fb.tar.zst 4.6 GB datagen-9_1-fb 16,087,483 1,342,158,397 XL datagen-9_1-fb.tar.zst 5.8 GB datagen-9_2-zf 434,943,376 1,042,340,732 XL datagen-9_2-zf.tar.zst 13.7 GB datagen-9_3-zf 555,270,053 1,309,998,551 XL datagen-9_3-zf.tar.zst 17.4 GB datagen-9_4-fb 29,310,565 2,588,948,669 XL datagen-9_4-fb.tar.zst 14.0 GB datagen-sf3k-fb 33,484,375 2,912,009,743 XL datagen-sf3k-fb.tar.zst 12.7 GB datagen-sf10k-fb 100,218,750 9,404,822,538 2XL datagen-sf10k-fb.tar.zst 40.5 GB dota-league 61,170 50,870,313 S dota-league.tar.zst 114.3 MB graph500-22 2,396,657 64,155,735 S graph500-22.tar.zst 202.4 MB graph500-23 4,610,222 129,333,677 M graph500-23.tar.zst 410.6 MB graph500-24 8,870,942 260,379,520 M graph500-24.tar.zst 847.7 MB graph500-25 17,062,472 523,602,831 L graph500-25.tar.zst 1.7 GB graph500-26 32,804,978 1,051,922,853 XL graph500-26.tar.zst 3.4 GB graph500-27 63,081,040 2,111,642,032 XL graph500-27.tar.zst 7.1 GB graph500-28 121,242,388 4,236,163,958 2XL graph500-28.tar.zst 14.4 GB graph500-29 232,999,630 8,493,569,115 2XL graph500-29.tar.zst 29.6 GB graph500-30 447,797,986 17,022,117,362 3XL graph500-30.tar.zst 60.8 GB kgs 832,247 17,891,698 XS kgs.tar.zst 65.7 MB twitter_mpi 52,579,678 1,963,263,508 XL twitter_mpi.tar.zst 5.7 GB wiki-Talk 2,394,385 5,021,410 2XS wiki-Talk.tar.zst 34.9 MB example-directed 10 17 - example-directed.tar.zst 1.0 KB example-undirected 9 12 - example-undirected.tar.zst 1.0 KB test-bfs-directed \u0026lt;100 \u0026lt;100 - test-bfs-directed.tar.zst \u0026lt;2.0 KB test-bfs-undirected \u0026lt;100 \u0026lt;100 - test-bfs-undirected.tar.zst \u0026lt;2.0 KB test-cdlp-directed 
\u0026lt;100 \u0026lt;100 - test-cdlp-directed.tar.zst \u0026lt;2.0 KB test-cdlp-undirected \u0026lt;100 \u0026lt;100 - test-cdlp-undirected.tar.zst \u0026lt;2.0 KB test-pr-directed \u0026lt;100 \u0026lt;100 - test-pr-directed.tar.zst \u0026lt;2.0 KB test-pr-undirected \u0026lt;100 \u0026lt;100 - test-pr-undirected.tar.zst \u0026lt;2.0 KB test-lcc-directed \u0026lt;100 \u0026lt;100 - test-lcc-directed.tar.zst \u0026lt;2.0 KB test-lcc-undirected \u0026lt;100 \u0026lt;100 - test-lcc-undirected.tar.zst \u0026lt;2.0 KB test-wcc-directed \u0026lt;100 \u0026lt;100 - test-wcc-directed.tar.zst \u0026lt;2.0 KB test-wcc-undirected \u0026lt;100 \u0026lt;100 - test-wcc-undirected.tar.zst \u0026lt;2.0 KB test-sssp-directed \u0026lt;100 \u0026lt;100 - test-sssp-directed.tar.zst \u0026lt;2.0 KB test-sssp-undirected \u0026lt;100 \u0026lt;100 - test-sssp-undirected.tar.zst \u0026lt;2.0 KB ","permalink":"https://ldbcouncil.org/benchmarks/graphalytics/","tags":null,"title":"LDBC Graphalytics Benchmark (LDBC Graphalytics)"},{"categories":null,"contents":"The Semantic Publishing Benchmark (SPB) is an LDBC benchmark for testing the performance of RDF engines inspired by the Media/Publishing industry. In particular, LDBC worked with British Broadcasting Corporation BBC to define this benchmark, for which BBC donated workloads, ontologies and data. The publishing industry is an area where significant adoption of RDF is taking place.\nThere have been many academic benchmarks for RDF but none of these are truly industrial-grade. The SPB combines a set of complex queries under inference with continuous updates and special failover tests for systems implementing replication.\nSPB performance is measured by producing a workload of CRUD (Create, Read, Update, Delete) operations which are executed simultaneously. The benchmark offers a data generator that uses real reference data to produce datasets of various sizes and tests the scalability aspect of RDF systems. 
The benchmark workload consists of (a) editorial operations that add new data, alter or delete existing (b) aggregation operations that retrieve content according to various criteria. The benchmark also tests conformance for various rules inside the OWL2-RL rule-set.\nThe SPB specification contains the description of the benchmark and the data generator and all information about its software components can be found on the SPB developer page.\nSemantic Publishing Benchmark (SPB) Audited Results SF Triples RW Agents Interactive (Q/sec) Updates (ops/sec) Cost Software Hardware Test Sponsor Date FDR 3 256M 16 / 4 335.48 25.66 177,474 USD GraphDB EE 10.0.1 AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 3 256M 24 / 0 413.16 0.00 207,474 USD GraphDB EE 10.0.1 AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 3 256M 64 / 4 1121.76 9.53 652,422 USD GraphDB EE 10.0.1 3×AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 3 256M 64 / 0 985.63 0.00 562,422 USD GraphDB EE 10.0.1 3×AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 5 1B 16 / 4 105.76 10.45 177,474 USD GraphDB EE 10.0.1 AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 5 1B 24 / 0 158.10 0.00 207,474 USD GraphDB EE 10.0.1 AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 5 1B 64 / 4 372.56 4.04 652,422 USD GraphDB EE 10.0.1 3×AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 5 1B 64 / 0 408.68 0.00 562,422 USD GraphDB EE 10.0.1 3×AWS r6id.8xlarge Ontotext AD 2023-01-29 FDR, summary 1 64M 8 / 2 100.85 10.19 37,504 EUR GraphDB EE 6.2 Intel Xeon E5-1650v3 6×3.5Ghz, 96GB RAM Ontotext AD 2015-04-26 FDR 1 64M 8 / 2 142.76 10.67 35,323 EUR GraphDB SE 6.3 alpha Intel Xeon E5-1650v3 6×3.5GHz, 64GB RAM Ontotext AD 2015-06-10 FDR 3 256M 8 / 2 29.90 9.50 37,504 EUR GraphDB EE 6.2 Intel Xeon E5-1650v3 6×3.5Ghz, 96GB RAM Ontotext AD 2015-04-26 FDR 3 256M 8 / 2 54.64 9.50 35,323 EUR GraphDB SE 6.3 alpha Intel Xeon E5-1650v3 6×3.5GHz, 64GB RAM Ontotext AD 2015-06-10 FDR 1 64M 22 / 2 149.04 156.83 20,213 USD 
Virtuoso v7.50.3213 Intel Xeon E5-2630 6×2.30GHz, 192 GB RAM OpenLink Software 2015-06-09 FDR 3 256M 22 / 2 80.62 92.71 20,213 USD Virtuoso v7.50.3213 Intel Xeon E5-2630 6×2.30GHz, 192 GB RAM OpenLink Software 2015-06-09 FDR 3 256M 30 / 3 115.38 109.85 24,528 USD Virtuoso v7.50.3213 AWS r3.8xlarge OpenLink Software 2015-06-09 FDR 5 1B 22 / 2 32.28 72.72 20,213 USD Virtuoso v7.50.3213 Intel Xeon E5-2630 6×2.30GHz, 192 GB RAM OpenLink Software 2015-06-09 FDR 5 1B 30 / 3 45.81 55.45 24,528 USD Virtuoso v7.50.3213 AWS r3.8xlarge OpenLink Software 2015-06-10 FDR LDBC-certified auditors SPB audits can be commissioned from the following LDBC-certified auditors:\nPjotr Scholtze ","permalink":"https://ldbcouncil.org/benchmarks/spb/","tags":null,"title":"LDBC Semantic Publishing Benchmark (LDBC-SPB)"},{"categories":null,"contents":"This file is here so that Hugo generates the page.\n","permalink":"https://ldbcouncil.org/benchmarks/snb-bi/","tags":null,"title":"LDBC SNB Business Intelligence workload"},{"categories":null,"contents":"This file is here so that Hugo generates the page.\n","permalink":"https://ldbcouncil.org/benchmarks/snb-interactive/","tags":null,"title":"LDBC SNB Interactive workload"},{"categories":null,"contents":"The Social Network Benchmark (SNB) suite defines graph workloads targeting database management systems and is maintained by the LDBC SNB Task Force.\nThe benchmark suite consists of two distinct workloads on a common dataset:\nThe Business Intelligence workload is focusing on aggregation- and join-heavy complex queries touching a large portion of the graph with microbatches of insert/delete operations. Its data sets are available in Cloudflare R2 and in the SURF/CWI repository. The Interactive workload captures transactional graph processing scenario with complex read queries that access the neighbourhood of a given node in the graph and update operations that continuously insert new data in the graph. 
Its data sets are available in the CWI/SURF data repository. For a brief overview, see our talk given at FOSDEM 2023\u0026rsquo;s graph developer room. The Social Network Benchmark\u0026rsquo;s specification can be found on arXiv.\nAudited results – SNB Business Intelligence workload – SNB Interactive workload Commissioning audits For auditing requests, please reach out at info@ldbcouncil.org. Audits can only be commissioned by LDBC member companies by contracting any of the LDBC-certified auditors. Note that there is a 2,000 GBP auditing fee to be paid for the LDBC for non-sponsor company members. Sponsor companies are exempt from this.\nFor a short summary of LDBC\u0026rsquo;s auditing process, including preparation steps, timelines, and pricing, see the Auditing process for the LDBC Social Network Benchmark document.\nUse of audited results Fair use policies The LDBC Social Network Benchmark is subject to the LDBC Fair Use Policies.\nRetrospective review of publications related to LDBC benchmark standards Review of 2019 preprint (sponsored by TigerGraph) Review of 2020 whitepaper (authored by Oracle) Review of the keynote at NODES 2021 (Neo4j Online Developer Expo and Summit) ","permalink":"https://ldbcouncil.org/benchmarks/snb/","tags":null,"title":"LDBC Social Network Benchmark (LDBC SNB)"},{"categories":null,"contents":"SNB Interactive (version 0.2.2) audited results SF Throughput Cost Software Hardware Test Sponsor Date Full Disclosure Report 10 101.20 ops/s 30,427 EUR Sparksee 5.1.1 2×Xeon 2630v3 8-core 2.4GHz, 256GB RAM Sparsity Technologies SA 2015-04-27 Full Disclosure Report 30 1,287.17 ops/s 20,212 EUR Virtuoso 07.50.3213 v7fasttrack 2×Xeon2630 6-core 2.4GHz, 192GB RAM OpenLink Software 2015-04-27 Full Disclosure Report 30 86.50 ops/s 30,427 EUR Sparksee 5.1.1 2×Xeon 2630v3 8-core 2.4GHz, 256GB RAM Sparsity Technologies SA 2015-04-27 Full Disclosure Report 100 1,200.00 ops/s 20,212 EUR Virtuoso 07.50.3213 v7fasttrack 2×Xeon2630 6-core 2.4GHz, 192GB RAM 
OpenLink Software 2015-04-27 Full Disclosure Report 100 81.70 ops/s 37,927 EUR Sparksee 5.1.1 2×Xeon 2630v3 8-core 2.4GHz, 256GB RAM Sparsity Technologies SA 2015-04-27 Full Disclosure Report 300 635.00 ops/s 20,212 EUR Virtuoso 07.50.3213 v7fasttrack 2×Xeon2630 6-core 2.4GHz, 192GB RAM OpenLink Software 2015-04-27 Full Disclosure Report ","permalink":"https://ldbcouncil.org/benchmarks/snb/audited-results-v0.2.2/","tags":null,"title":"LDBC Social Network Benchmark legacy audited results"},{"categories":null,"contents":"This file is here so that Hugo generates the member page.\n","permalink":"https://ldbcouncil.org/organizational-members/","tags":null,"title":"Organizational Members"},{"categories":null,"contents":"LDBC currently offers the following benchmarks:\nGraphalytics: Graph algorithms for graph analytical platforms Financial Benchmark: Benchmark for financial workloads, targeting distributed systems Semantic Publishing Benchmark: An RDF-based benchmark for semantic databases Social Network Benchmark Suite (SNB): The SNB targets database management systems with graph-processing capabilities. It consists of two workloads, Interactive and Business Intelligence Uses of LDBC benchmarks are subject to the Fair Use Policy for LDBC Benchmarks.\n","permalink":"https://ldbcouncil.org/benchmarks/overview/","tags":null,"title":"Overview of LDBC Benchmarks"},{"categories":null,"contents":"Group leaders: Jan Hidders (Birkbeck College, University of London), Juan Sequeda (data.world)\nThe PGSWG has 4 sub-groups: PG-Basic, PG-Constraints, PG-Properties, PG-Nulls\nPG-Basic Group leader: Jan Hidders (Birkbeck College, University of London) Focus: Basic constructs and semantics Group members Alastair Green (JCC Consulting; Birkbeck College, University of London) Angela Bonifati (Lyon 1 University) Bei Li (Google) Dominik Tomaszuk (University of Bialystok) Enrico Franconi (Free University of Bozen-Bolzano) George Fletcher (Eindhoven TU) Gilles Privat (Orange S.A.) 
Hannes Voigt (Neo4j) Harsh Thakkar (Consultant OSTHUS GmBH) Jan Hidders (Birkbeck College, University of London) Jason Crawford (Amazon) Josh Perryman (VeracityID) Joshua Shinavier (LinkedIn) Juan Sequeda (data.world) Keith W. Hare (JCC Consulting) Koji Annoura (UTI, Inc.) Leonid Libkin (ENS Paris, University of Edinburgh) Liat Peterfreund (ENS Paris) Michael Schmidt (Amazon Web Services) Renzo Angles (Universidad de Talca) Slawek Staworko (Université de Lille) Stefania Dumbrava (Ecole Nationale Supérieure d\u0026rsquo;Informatique pour l\u0026rsquo;Industrie et l\u0026rsquo;Entreprise (ENSIIE)) Victor Lee (TigerGraph) Victor Marsault (CNRS) Wim Martens (University of Bayreuth) Wook-Shin Han (POSTECH) PG-Constraints Group leader: George Fletcher (TU Eindhoven) Focus: Key constraints and cardinality constraints Group members Alastair Green (JCC Consulting; Birkbeck College, University of London) Andrea Cali (Birkbeck College, University of London) Angela Bonifati (Lyon 1 University) Bei Li (Google) Borislav Iordanov (Kobrix) Dominik Tomaszuk (University of Bialystok) Enrico Franconi (Free University of Bozen-Bolzano) Filip Murlak (University of Warsaw) George Fletcher (Eindhoven TU) Jan Hidders (Birkbeck College, University of London) Jason Crawford (Amazon) Josh Perryman (VeracityID) Juan Sequeda (data.world) Keith W. Hare (JCC Consulting) Koji Annoura (UTI, Inc.) 
Leonid Libkin (ENS Paris, University of Edinburgh) Michael Schmidt (Amazon Web Services) Slawek Staworko (Université de Lille) Stefania Dumbrava (Ecole Nationale Supérieure d\u0026rsquo;Informatique pour l\u0026rsquo;Industrie et l\u0026rsquo;Entreprise (ENSIIE)) Wim Martens (University of Bayreuth) Wook-Shin Han (POSTECH) PG-Properties Group leader: Joshua Shinavier (LinkedIn) Focus: Data types for properties Group members Alastair Green (JCC Consulting; Birkbeck College, University of London) Angela Bonifati (Lyon 1 University) Bei Li (Google) Borislav Iordanov (Kobrix) Dominik Tomaszuk (University of Bialystok) Enrico Franconi (Free University of Bozen-Bolzano) Filip Murlak (University of Warsaw) George Fletcher (Eindhoven TU) Gilles Privat (Orange S.A.) Harsh Thakkar (Consultant OSTHUS GmBH) Jan Hidders (Birkbeck College, University of London) Jason Crawford (Amazon) Josh Perryman (VeracityID) Joshua Shinavier (LinkedIn) Juan Sequeda (data.world) Keith W. Hare (JCC Consulting) Koji Annoura (UTI, Inc.) Michael Schmidt (Amazon Web Services) Renzo Angles (Universidad de Talca) Stefania Dumbrava (Ecole Nationale Supérieure d\u0026rsquo;Informatique pour l\u0026rsquo;Industrie et l\u0026rsquo;Entreprise (ENSIIE)) Victor Lee (TigerGraph) Victor Marsault (CNRS) Wim Martens (University of Bayreuth) Wook-Shin Han (POSTECH) PG-Nulls Group leader: Angela Bonifati (Lyon 1 University) Focus: Null values Group members Alastair Green (JCC Consulting; Birkbeck College, University of London) Angela Bonifati (Lyon 1 University) Dominik Tomaszuk (University of Bialystok) Enrico Franconi (Free University of Bozen-Bolzano) Filip Murlak (University of Warsaw) Gilles Privat (Orange S.A.) Jan Hidders (Birkbeck College, University of London) Joshua Shinavier (LinkedIn) Juan Sequeda (data.world) Koji Annoura (UTI, Inc.) 
Leonid Libkin (ENS Paris, University of Edinburgh) Liat Peterfreund (ENS Paris) Michael Schmidt (Amazon Web Services) Paolo Guagliardo (University of Edinburgh) Slawek Staworko (Université de Lille) Stefania Dumbrava (Ecole Nationale Supérieure d\u0026rsquo;Informatique pour l\u0026rsquo;Industrie et l\u0026rsquo;Entreprise (ENSIIE)) Victor Lee (TigerGraph) Wim Martens (University of Bayreuth) Wook-Shin Han (POSTECH) Threshold queries Angela Bonifati (Lyon 1 University) Dominik Tomaszuk (University of Bialystok) Filip Murlak (University of Warsaw) George Fletcher (Eindhoven TU) Jan Hidders (Birkbeck College, University of London) Joshua Shinavier (LinkedIn) Matthias Hofer (University of Bayreuth) Slawek Staworko (Université de Lille) Stefania Dumbrava (Ecole Nationale Supérieure d\u0026rsquo;Informatique pour l\u0026rsquo;Industrie et l\u0026rsquo;Entreprise (ENSIIE)) Wim Martens (University of Bayreuth) ","permalink":"https://ldbcouncil.org/gql-community/pgswg/","tags":null,"title":"Property Graph Schema Working Group (PGSWG)"},{"categories":null,"contents":"See our publications page.\n","permalink":"https://ldbcouncil.org/publications/","tags":null,"title":"Publications"}] \ No newline at end of file diff --git a/index.xml b/index.xml new file mode 100644 index 00000000..4de68a22 --- /dev/null +++ b/index.xml @@ -0,0 +1,5298 @@ + + + + Linked Data Benchmark Council + https://ldbcouncil.org/ + Recent content on Linked Data Benchmark Council + Hugo -- gohugo.io + en-us + &copy; Copyright LDBC 2024 + Tue, 27 Jun 2023 00:00:00 +0000 + + Announcing the Official Release of LDBC Financial Benchmark v0.1.0 + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + Tue, 27 Jun 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + <p>We are delighted to announce the official release of the initial version (v0.1.0) of <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial 
Benchmark (FinBench)</a>.</p> +<p>The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">LDBC FinBench Task Force</a>. The benchmark has one workload currently, <strong>Transaction Workload</strong>, capturing an OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.</p> +<p>Compared to LDBC SNB, FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the <a href="https://ldbcouncil.org/benchmarks/finbench/finbench-talk-16th-tuc.pdf">slides</a> in the 16th TUC. The <a href="https://arxiv.org/pdf/2306.15975.pdf">Financial Benchmark&rsquo;s specification</a> can be found on arXiv.</p> +<p>The release of FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2023. This is a good beginning for FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.</p> +<p>If you are interested in joining the FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.</p> + + + + + Sixteenth TUC Meeting + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + Fri, 23 Jun 2023 09:00:00 -0800 + + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Oskar van Rest, Alastair Green, Gábor Szárnyas</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2023.sigmod.org/venue.shtml">SIGMOD 2023</a> on <strong>June 23-24 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10- and 15-minute talks followed by a Q&amp;A session. The talks will be recorded and made available online. 
<strong>If you would like to participate please register using <a href="https://forms.gle/T6bwVHzK9V5FaKyR9">our form</a>.</strong></p> +<p>LDBC will host a <strong>social event</strong> on Friday at the <a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a>.</p> +<p>In addition, AWS will host a <strong>Happy Hour</strong> (rooftop grill with beverages) on Saturday on the Amazon Nitro South building&rsquo;s 8th floor deck: <a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>.</p> +<h3 id="program">Program</h3> +<p><strong>All times are in PDT.</strong></p> +<h4 id="friday">Friday</h4> +<p><strong>Location:</strong> Hyatt Regency Bellevue on Seattle&rsquo;s Eastside, <strong>room Grand K</strong>, co-located with SIGMOD (<a href="https://www.hyatt.com/en-US/hotel/washington/hyatt-regency-bellevue-on-seattles-eastside/belle">900 Bellevue Way NE, Bellevue, WA 98004-4272</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>08:30</td> +<td>08:45</td> +<td>Oskar van Rest (Oracle)</td> +<td>LDBC – State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-ldbc-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/Frk7ITssaSY">video</a></td> +</tr> +<tr> +<td>08:50</td> +<td>09:05</td> +<td>Keith Hare (JCC / WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/keith-hare-an-update-on-the-gql-and-sql-pgq-standards-efforts.pdf">slides</a>, <a href="https://youtu.be/LQYkal_0j6E">video</a></td> +</tr> +<tr> +<td>09:10</td> +<td>09:25</td> +<td>Stefan Plantikow (Neo4j / WG3)</td> +<td>GQL - Introduction to a new query 
language standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/stefan-plantikow-gql-v1.pdf">slides</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Leonid Libkin (University of Edinburgh &amp; RelationalAI)</td> +<td>Formalizing GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/leonid-libkin-formalizing-gql.pdf">slides</a>, <a href="https://youtu.be/YZE1a00h1I4">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Semen Panenkov (JetBrains Research)</td> +<td>Mechanizing the GQL semantics in Coq – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/semyon-panenkov-gql-in-coq.pdf">slides</a>, <a href="https://youtu.be/5xBGohqWCzo">videos</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Oskar van Rest (Oracle)</td> +<td>SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-sql-property-graphs-in-oracle-database-and-oracle-graph-server-pgx.pdf">slides</a>, <a href="https://youtu.be/owM9WiQubpg">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (JCC)</td> +<td>LDBC&rsquo;s organizational changes and fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-ldbc-corporate-restructuring-and-fair-use-policies.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>Ioana Manolescu (INRIA)</td> +<td>Integrating Connection Search in Graph Queries – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ioana-manolescu-integrating-connection-search-in-graph-queries.pdf">slides</a>, <a 
href="https://youtu.be/LQPnmcrkUpY">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Maciej Besta (ETH Zurich)</td> +<td>Neural Graph Databases with Graph Neural Networks – <a href="https://youtu.be/ce5qNievRNs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:10</td> +<td>Longbin Lai (Alibaba Damo Academy)</td> +<td>To Revisit Benchmarking Graph Analytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/longbin-lai-benchmark-ldbc.pdf">slides</a>, <a href="https://youtu.be/s9Vtt-6t_FI">video</a></td> +</tr> +<tr> +<td>12:15</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>The World of Graph Databases from An Industry Perspective – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/yuanyuan-tian-world-of-graph-databases.pdf">slides</a>, <a href="https://youtu.be/AZuP_b95GPM">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Alin Deutsch (UC San Diego &amp; TigerGraph)</td> +<td>TigerGraph&rsquo;s Parallel Computation Model – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alin-deutsch-tigergraphs-computation-model.pdf">slides</a>, <a href="https://youtu.be/vcxdieJB80Y">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Chen Zhang (CreateLink)</td> +<td>Applications of a Native Distributed Graph Database in the Financial Industry – <a href="https://youtu.be/GCCT79Sps9I">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Ricky Sun (Ultipa)</td> +<td>Design of highly scalable graph database systems – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ricky-sun-ultipa.pdf">slides</a>, <a href="https://youtu.be/Sg1F64O4vGM">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> 
+<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Heng Lin (Ant Group)</td> +<td>The LDBC SNB implementation in TuGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-the-ldbc-snb-implementation-in-tugraph.pdf">slides</a>, <a href="https://youtu.be/fy8AuVerwnY">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>Shipeng Qi (Ant Group)</td> +<td>FinBench: The new LDBC benchmark targeting financial scenario – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/shipeng-qi-finbench.pdf">slides</a>, <a href="https://youtu.be/0xLZadDOfZk">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>17:00</td> +<td>host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>FinBench panel – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-finbench-panel.pdf">slides</a></td> +</tr> +<tr> +<td>19:00</td> +<td>22:00</td> +<td><em>dinner</em></td> +<td><em><a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a></em></td> +</tr> +</tbody> +</table> +<h4 id="saturday">Saturday</h4> +<p><strong>Location:</strong> Amazon Nitro South building, <strong>room 03.204</strong> (<a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:00</td> +<td>09:45</td> +<td>Brad Bebee (AWS)</td> +<td>Customers don&rsquo;t want a graph database, so why are we still here? 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/brad-bebee-tuc-keynote.pdf">slides</a>, <a href="https://youtu.be/bJlkpDC--fM">video</a></td> +</tr> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Fast and Efficient Update Handling for Graph H2TAP – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/muhammad-attahir-jibril-fast-and-efficient-update-handling-for-graph-h2tap.pdf">slides</a>, <a href="https://youtu.be/e8ZAszBsXV0">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Gabor Szarnyas (CWI)</td> +<td>LDBC Social Network Benchmark and Graphalytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-social-network-benchmark-and-graphalytics.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:30</td> +<td>Atanas Kiryakov and Tomas Kovachev (Ontotext)</td> +<td>GraphDB – Benchmarking against LDBC SNB &amp; SPB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tomas-kovatchev-atanas-kiryakov-benchmarking-graphdb-with-snb-and-spb.pdf">slides</a>, <a href="https://youtu.be/U6OPpNFOWqg">video</a></td> +</tr> +<tr> +<td>11:35</td> +<td>11:50</td> +<td>Roi Lipman (Redis Labs)</td> +<td>Delta sparse matrices within RedisGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/roi-lipman-delta-matrix.pdf">slides</a>, <a href="https://youtu.be/qfKsplV4Ihk">video</a></td> +</tr> +<tr> +<td>11:55</td> +<td>12:05</td> +<td>Rathijit Sen (Microsoft)</td> +<td>Microarchitectural Analysis of Graph BI Queries on RDBMS – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/rathijit-sen-microarchitectural-analysis.pdf">slides</a>, <a href="https://youtu.be/55B8CkH09js">video</a></td> +</tr> +<tr> +<td>12:10</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td><em>on your own</em></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Alastair Green (JCC)</td> +<td>LEX &ndash; LDBC Extended GQL Schema – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-lex.pdf">slides</a>, <a href="https://youtu.be/DVpeb4Ce9Uw">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Ora Lassila (AWS)</td> +<td>Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ora-lassila-why-limit-yourself-to-lpg-when-you-can-do-rdf-too.pdf">slides</a>, <a href="https://youtu.be/7uAInoUwdds">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Jan Hidders (Birkbeck, University of London)</td> +<td>PG-Schema: a proposal for a schema language for property graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/jan-hidders-pg-schema.pdf">slides</a>, <a href="https://youtu.be/yQNL8hBTE4M">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Max de Marzi (RageDB and RelationalAI)</td> +<td>RageDB: Building a Graph Database in Anger – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/max-de-marzi-ragedb-building-a-graph-database-in-anger.pdf">slides</a>, <a href="https://youtu.be/LBbF8aslYFE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Umit Catalyurek (AWS)</td> +<td>HPC Graph Analytics on the OneGraph Model – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/umit-catalyurek-onegraph-hpc.pdf">slides</a>, <a href="https://youtu.be/64tv5LA6Wr8">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>David J. Haglin (Trovares)</td> +<td>How LDBC impacts Trovares – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/david-haglin-trovares.pdf">slides</a>, <a href="">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>16:25</td> +<td>Wenyuan Yu (Alibaba Damo Academy)</td> +<td>GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/wenyuan-yu-graphscope-flex.pdf">slides</a>, <a href="https://youtu.be/cRikoyDmMks">video</a></td> +</tr> +<tr> +<td>16:30</td> +<td>16:40</td> +<td>Scott McMillan (Carnegie Mellon University)</td> +<td>Graph processing using GraphBLAS – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/scott-mcmillan-graph-processing-using-graphblas.pdf">slides</a>, <a href="https://youtu.be/yb4hGBhUzQQ">video</a></td> +</tr> +<tr> +<td>16:45</td> +<td>16:55</td> +<td>Tim Mattson (Intel)</td> +<td>Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tim-mattson-graphblas-and-tiledb.pdf">slides</a></td> +</tr> +<tr> +<td>17:00</td> +<td>20:00</td> +<td><em>happy hour (rooftop grill with beverages)</em></td> +<td><em>on the Nitro South building&rsquo;s 8th floor deck</em></td> +</tr> +</tbody> +</table> +<h4 id="tuc-event-locations">TUC event locations</h4> +<p>A <a href="https://www.google.com/maps/d/u/0/edit?mid=19_fi4fV-3-PZkNWCCcmhU86ct2EZXbgo">map of the LDBC TUC events</a> we hosted so far.</p> + + + + + LDBC SNB – Early 2023 updates + 
https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + Wed, 15 Feb 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + <p>2023 has been an eventful year for us so far. Here is a summary of our recent activities.</p> +<ol> +<li> +<p>Our paper <a href="https://ldbcouncil.org/docs/papers/ldbc-snb-bi-vldb-2022.pdf">The LDBC Social Network Benchmark: Business Intelligence Workload</a> was published in PVLDB.</p> +</li> +<li> +<p>David Püroja just completed his MSc thesis on creating a design towards <a href="https://ldbcouncil.org/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf">SNB Interactive v2</a> at CWI&rsquo;s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference&rsquo;s graph developer room titled <a href="https://fosdem.org/2023/schedule/event/graph_ldbc/">The LDBC Social Network Benchmark</a> (<a href="https://www.youtube.com/watch?v=YNF6z6gtXY4">YouTube mirror</a>).</p> +</li> +<li> +<p>I gave a lightning talk at FOSDEM&rsquo;s HPC developer room titled <a href="https://www.youtube.com/watch?v=q26DHnQFw54">The LDBC Benchmark Suite</a> (<a href="https://www.youtube.com/watch?v=q26DHnQFw54">YouTube mirror</a>).</p> +</li> +<li> +<p>Our auditors have successfully benchmarked a number of systems:</p> +<ul> +<li>SPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze)</li> +<li>SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja)</li> +<li>SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr)</li> +</ul> +</li> +</ol> +<p>The results and the full disclosure reports are available under the <a href="https://ldbcouncil.org/benchmarks/spb/">SPB</a> and <a href="https://ldbcouncil.org/benchmarks/snb/">SNB benchmark pages</a>.</p> + + + + + LDBC SNB Datagen – The winding path to SF100K + 
https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + Tue, 13 Sep 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + <p>LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">last technical update</a> on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we reached several goals: we refactored the serializers to use Spark&rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.</p> +<h1 id="moving-to-sparksql">Moving to SparkSQL</h1> +<p>We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL&rsquo;s capabilities.</p> +<blockquote> +<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. 
Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.</p> +</blockquote> +<p>Dealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL is unsuitable. We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup>, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators&rsquo; state between the UDFs to ensure deterministic execution. Weighing these issues and the fact that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:</p> +<ol> +<li>table manipulations related to shaping the output into the supported layouts and data types as set forth in the specification;</li> +<li>deriving the Interactive and BI datasets;</li> +<li>and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes.</li> +</ol> +<p>We refer to points (1.) and (2.) collectively as dataset transformation, and to (3.) as factor generation. Initially, these had been part of the generator; they were extracted as part of this refactor, which resulted in a cleaner, more maintainable design.</p> +<p><img src="datagen_df_0.png" alt="Datagen stages"></p> +<p>The diagram above shows the components on a high level. 
The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there&rsquo;s no simple way to avoid<br> +it, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.</p> +<p>I&rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">previous blogpost in the series</a> or the <a href="https://arxiv.org/abs/2001.02299">Interactive benchmark specification</a>.</p> +<h1 id="transformation-pipeline">Transformation pipeline</h1> +<p>The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:</p> +<ul> +<li>explodes edges and / or attributes into separate tables,</li> +<li>subsets the snapshot part and creates insert / delete batches for the BI workload,</li> +<li>subsets the snapshot part for the Interactive workload,</li> +<li>applies formatting related options such as date time representation,</li> +<li>serializes the data to a Spark supported format (CSV, Parquet),</li> +</ul> +<p>We utilize a flexible data pipeline that operates on the graph.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code 
class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span>, <span style="color:#66d9ef">M2</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">]</span> <span style="color:#a6e22e">extends</span> <span style="color:#f92672">(</span><span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">])</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">In</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span 
style="color:#66d9ef">def</span> apply<span style="color:#f92672">(</span>v<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">])</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>v<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>The <code>Transform</code> trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let&rsquo;s see some of the transformations we have.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToBiTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">BI</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> keepImplicitDeletes<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span 
style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.BI</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span 
style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeEdges</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> 
+</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Therefore, a transformation pipeline may look like this:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>params<span style="color:#f92672">,</span> start<span style="color:#f92672">,</span> end<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>inputGraph<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>The <code>Graph</code> record has a <code>definition</code> field containing graph-global metadata, whereas <code>entities</code> holds the 
datasets keyed by their entity type. There are 3 graph <em>modes</em> currently: <code>Raw</code>, <code>Interactive</code> and <code>BI</code>. The BI dataset has a different layout than the rest, as it contains incremental inserts and deletes for the entities in addition to the bulk snapshot. This is captured in the <code>Layout</code> dependent type, over which the entities are polymorphic.</p> +<p>It&rsquo;s important to understand that <code>Graph</code> holds <code>DataFrame</code>s, and these are lazily computed by Spark. So, <code>Graph</code> is merely a description of transformations used to derive the datasets it comprises, which makes them subject to all the SparkSQL fanciness such as query optimization, whole stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> isAttrExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> isEdgesExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> useTimestamp<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> mode<span style="color:#66d9ef">:</span> 
<span style="color:#66d9ef">M</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">Option</span><span style="color:#f92672">[</span><span style="color:#66d9ef">String</span><span style="color:#f92672">]]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> definition<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M</span><span style="color:#f92672">],</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">M</span><span style="color:#66d9ef">#</span><span style="color:#66d9ef">Layout</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">sealed</span> <span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Layout</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Raw</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Interactive</span><span style="color:#f92672">(</span>bulkLoadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... 
*/</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">BI</span><span style="color:#f92672">(</span>bulkloadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">,</span> batchPeriod<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">String</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">BatchedEntity</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>You may notice that <code>Transform</code> is statically typed w.r.t. <code>Mode</code>, however other properties, like <code>isAttrExploded</code> or <code>isEdgesExploded</code>, are not captured in the type, and remain merely dynamic. This makes some nonsensical transformation pipelines (e.g. one that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.</p> +<p>As we already mentioned, <code>Graph</code> is essentially a persistent container of <code>EntityType -&gt; DataFrame</code> mappings.
<code>EntityType</code> can be <code>Node</code>, <code>Edge</code> and <code>Attr</code>, and is used to identify the entity and embellish it with static metadata, such as a descriptive name and primary key, whether it is static or dynamic (as per the specification), and in case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.</p> +<p>Usually, a graph transformation involves matching entities based on their <code>EntityType</code>, and modifying the mapping (and if required, other metadata). Take, for example, the <code>ExplodeAttrs</code> transformation, which explodes into separate tables the values of two columns of <code>Person</code> stored as arrays:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">if</span> <span
style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#75715e">// assert at runtime that the transformation hasn&#39;t been applied yet +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#66d9ef">throw</span> <span style="color:#66d9ef">new</span> <span style="color:#a6e22e">AssertionError</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Attributes already exploded in the input graph&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> explodedAttr<span style="color:#f92672">(</span>attr<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Attr</span><span style="color:#f92672">,</span> node<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">DataFrame</span><span style="color:#f92672">,</span> column<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Column</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">=</span> +</span></span><span style="display:flex;"><span> attr <span style="color:#f92672">-&gt;</span> node<span style="color:#f92672">.</span>select<span style="color:#f92672">(</span>withRawColumns<span style="color:#f92672">(</span>attr<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>parent<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">),</span> 
explode<span style="color:#f92672">(</span>split<span style="color:#f92672">(</span>column<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;;&#34;</span><span style="color:#f92672">)).</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>attribute<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">)))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> modifiedEntities <span style="color:#66d9ef">=</span> input<span style="color:#f92672">.</span>entities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>collect <span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k <span style="color:#66d9ef">@</span> <span style="color:#a6e22e">Node</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person&#34;</span><span style="color:#f92672">,</span> <span style="color:#66d9ef">false</span><span style="color:#f92672">),</span> df<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#75715e">// match the Person node. 
This is the only one ExplodeAttrs should modify +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#a6e22e">Map</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Email&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;EmailAddress&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonEmailEmailAddress&#34; entity derived by exploding the email column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Speaks&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;Language&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonSpeaksLanguage&#34; entity derived by exploding the language column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> k <span style="color:#f92672">-&gt;</span> df<span style="color:#f92672">.</span>drop<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">)</span> <span style="color:#75715e">// drop the exploded 
columns from person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntities <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>entities<span style="color:#f92672">)(</span><span style="color:#66d9ef">_</span> <span style="color:#f92672">++</span> <span style="color:#66d9ef">_</span><span style="color:#f92672">)</span> <span style="color:#75715e">// merge-replace the modified entities in the graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntityDefinitions <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#f92672">(</span>e<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> e <span style="color:#f92672">++</span> v<span style="color:#f92672">.</span>map<span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> k <span 
style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Some</span><span style="color:#f92672">(</span>v<span style="color:#f92672">.</span>schema<span style="color:#f92672">.</span>toDDL<span style="color:#f92672">)</span> <span style="color:#f92672">}</span> <span style="color:#75715e">// update the entity definition schema to reflect the modifications +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> l <span style="color:#66d9ef">=</span> lens<span style="color:#f92672">[</span><span style="color:#66d9ef">In</span><span style="color:#f92672">]</span> <span style="color:#75715e">// lenses provide a terse syntax for modifying nested fields +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">(</span>l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>entities<span style="color:#f92672">).</span>set<span style="color:#f92672">(</span>input<span style="color:#f92672">)((</span><span style="color:#66d9ef">true</span><span style="color:#f92672">,</span> updatedEntityDefinitions<span style="color:#f92672">,</span> updatedEntities<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Note that <code>EntityType</code> does not hold the dataset&rsquo;s full SQL schema currently, as it&rsquo;s not useful for pattern matching, but can be accessed directly from <code>DataFrame</code> if 
needed.</p> +<h1 id="inputoutput">Input/output</h1> +<p>The <code>Reader</code> and <code>Writer</code> typeclasses are used to read from a <code>Source</code> and write to a <code>Sink</code> respectively, terminating a graph transformation pipeline<br> +on both ends.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">T</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> read<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> exists<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">S</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Data</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> write<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Data</span><span style="color:#f92672">,</span> sink<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">S</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Unit</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>There are implementations under <code>ldbc.datagen.io.instances</code> that read a graph from a <code>GraphSource</code> and write to a <code>GraphSink</code>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span 
style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> source <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSource</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>read<span style="color:#f92672">(</span>source<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span><span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transformedGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>graph<span style="color:#f92672">)</span> +</span></span><span 
style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> sink <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSink</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>write<span style="color:#f92672">(</span>transformedGraph<span style="color:#f92672">,</span> sink<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>We provide <a href="https://github.com/typelevel/simulacrum">Ops syntax</a> to make it shorter:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> 
ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Reader.ops._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Writer.ops._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">).</span>read +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transformedGraph <span 
style="color:#66d9ef">=</span> <span style="color:#f92672">???</span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span>transformedGraph<span style="color:#f92672">.</span>write<span style="color:#f92672">(</span><span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">))</span> +</span></span></code></pre></div><p>The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.</p> +<p>Spark has a facility to derive SparkSQL schema from case classes automatically<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup>. We created case classes for each entity in the <code>Raw</code> dataset. 
We also created a typeclass <code>EntityTraits</code> associating these classes with their <code>EntityType</code>, so we can summon them (and consequently their SparkSQL schema) in the reader.</p> +<p>The case classes are used during the serialization of the generated dataset too, but more about that later.</p> +<h1 id="factor-generation">Factor generation</h1> +<p>As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity-local aggregates was impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a manageable scope, the original factor generation code had been removed.</p> +<p>We decided it&rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator&rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number of factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing certain factor tables even use <a href="https://spark.apache.org/graphx/">GraphX</a>, which was unimaginable with the previous design.</p> +<p>Factor tables are added by extending a map with a <code>name -&gt; Factor</code> pair.
<code>Factor</code> declares its input entities, and accepts a function that receives input <code>DataFrames</code>, and returns a single <code>DataFrame</code> as output.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> factors <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Map</span> <span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;personDisjointEmployerPairs&#34;</span> <span style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Factor</span><span style="color:#f92672">(</span><span style="color:#a6e22e">PersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonKnowsPersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">OrganisationType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonWorkAtCompanyType</span><span style="color:#f92672">)</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">case</span> <span style="color:#a6e22e">Seq</span><span style="color:#f92672">(</span>person<span style="color:#f92672">,</span> personKnowsPerson<span style="color:#f92672">,</span> organisation<span style="color:#f92672">,</span> workAt<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> knows <span style="color:#66d9ef">=</span> undirectedKnows<span style="color:#f92672">(</span>personKnowsPerson<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> company <span style="color:#66d9ef">=</span>
organisation<span style="color:#f92672">.</span>where<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;Type&#34;</span> <span style="color:#f92672">===</span> <span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">).</span>cache<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> personSample <span style="color:#66d9ef">=</span> person +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>orderBy<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>limit<span style="color:#f92672">(</span><span style="color:#ae81ff">20</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> personSample +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person2&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>knows<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;knows&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;knows.person2Id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>workAt<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;workAt&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;workAt.PersonId&#34;</span> <span 
style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;knows.Person1id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>company<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;Company.id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;workAt.CompanyId&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>select<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2id&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.name&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyName&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyId&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.creationDate&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2creationDate&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.deletionDate&#34;</span><span 
style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2deletionDate&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>distinct<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">},</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* more factors */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span></code></pre></div><p>As you can see, it&rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. <code>undirectedKnows</code>). Currently, there&rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. it uses the dataframe writer under the hood.</p> +<h1 id="revamping-the-data-generators-serializer">Revamping the data generator&rsquo;s serializer</h1> +<p>At this point, both the transformation pipeline and factor generator were ready; however, the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator&rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we&rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.</p> +<blockquote> +<p><a href="https://parquet.apache.org/">Parquet</a> is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. 
It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.</p> +</blockquote> +<p>The new serialization framework is heavily influenced by the design of Java <code>OutputStreams</code>, in the sense that stateful objects are composed to form a pipeline. For example, in case of <em>activities</em>, the input is an activity tree, and the output is a set of rows in multiple files (eg. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown on the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).</p> +<p><img src="activity.png" alt="Activity serialization pipeline"></p> +<p>The benefit of this architecture is that only the last component needs to change when we add support for a new output format.</p> +<p>To support Parquet, we made use of row-level serializers available in Hadoop&rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. Remember how we used case classes for the <code>Raw</code> entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. 
<code>Forum</code>) and Spark&rsquo;s <code>Encoder</code> framework to encode the entities in Parquet, which means that the generated output remains consistent with the <code>DataFrame</code>-based reader, and we spare a lot of code duplication.</p> +<h1 id="optimizations">Optimizations</h1> +<p>After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out of memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (subsequently, larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation<sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup>. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still more than 6x reduction in cost). We weren&rsquo;t able to run SF30K on 10 machines (1 machine / SF3K); even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf3k_bi <span style="color:#ae81ff">3000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">330</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf10k_bi <span style="color:#ae81ff">10000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span 
style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">1000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><p>The above examples are working configurations for generating the 3K and 10K BI datasets. The <code>--sf-per-executor</code> option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes respectively. The <code>--partitions</code> option controls the total number of partitions, and was calculated based on the number of persons using the formula <code>partitions = ceil(number_of_persons / block_size / 3)</code> to get a maximum of 3 blocks per partition.</p> +<h1 id="conclusion">Conclusion</h1> +<p>These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.</p> +<h1 id="footnotes">Footnotes</h1> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>The generator produces hierarchies, such as forum wall with a random number of posts, that have comments, etc. 
This tree is iterated, and different entities are written to separate files.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>Shameless plug: You can learn more on this from <a href="https://www.dataversity.net/case-study-deriving-spark-encoders-and-schemas-using-implicits/">another blogpost of mine</a>.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>The data generator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>The maximum row count per file is currently 10M, however, this can be modified with a command line option. We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Fifteenth TUC Meeting + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + Fri, 17 Jun 2022 09:20:00 -0500 + + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2022.sigmod.org/venue.shtml">SIGMOD 2022</a> on <strong>June 17-18 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10-15 minute talks followed by a Q&amp;A session. The talks will be recorded and made available online.<br> +The tentative program is the following. 
<strong>All times are in EDT.</strong></p> +<p>We will have a social event on Friday at 17:30 at <a href="https://elvezrestaurant.com/">El Vez</a> (<a href="https://g.page/ElVezPhilly">Google Maps</a>).</p> +<h4 id="friday-pennsylvania-convention-centerhttpswwwpaconventioncom-room-204bhttps2022sigmodorgprogramshtml">Friday (<a href="https://www.paconvention.com/">Pennsylvania Convention Center</a>, <a href="https://2022.sigmod.org/program.shtml">room 204B</a>)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:20</td> +<td>09:30</td> +<td>Peter Boncz (LDBC/CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/39BoOIGk9Is">video</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Alastair Green (LDBC/Birkbeck)</td> +<td>LDBC&rsquo;s fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-fair-use-of-the-ldbc-trademark.pdf">slides</a>, <a href="https://youtu.be/7zmCysN4Rpg">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)</td> +<td>LDBC Social Network Benchmark: Business Intelligence workload v1.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/gabor-szarnyas-the-ldbc-social-network-benchmark-business-intelligence-workload.pdf">slides</a>, <a href="https://youtu.be/AJ96M8_njxE">video</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Heng Lin (Ant Group)</td> +<td>LDBC Financial Benchmark introduction – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/heng-lin-ldbc-financial-benchmark-introduction.pdf">slides</a>, <a 
href="https://youtu.be/iBhud_YjafY">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Chen Zhang (CreateLink)</td> +<td>New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/chen-zhang-new-ldbc-snb-benchmark-record-by-galaxybase-more-than-6-times-faster-and-70-percent-higher-throughput.pdf">slides</a>, <a href="https://youtu.be/sMzTsb8iw_Y">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>James Clarkson (Neo4j)</td> +<td>LDBC benchmarks: Promoting good science and industrial consumption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/james-clarkson-ldbc-benchmarks-promoting-good-science-and-industrial-consumption.pdf">slides</a>, <a href="https://youtu.be/VYG1mzcl9qQ">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Oskar van Rest (Oracle)</td> +<td>Creating and querying property graphs in Oracle, on-premise and in the cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oskar-van-rest-creating-and-querying-property-graphs-in-oracle-on-premise-and-in-the-cloud.pdf">slides</a>, <a href="https://youtu.be/2HX2Vixf2gs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:15</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>Conquering LDBC SNB BI at SF-10k – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/mingxi-wu-conquering-ldbc-snb-bi-at-sf10k.pdf">slides</a>, <a href="https://youtu.be/oJbqzQ_t3G8">video</a></td> +</tr> +<tr> +<td>12:20</td> +<td>13:20</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:20</td> +<td>13:35</td> +<td>Altan Birler (Technische Universität München)</td> +<td>Relational databases can 
handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/altan-birler-relational-databases-can-handle-graphs-too.pdf">slides</a>, <a href="https://youtu.be/cRgbdY3I2i4">video</a></td> +</tr> +<tr> +<td>13:40</td> +<td>13:55</td> +<td>David Püroja (CWI)</td> +<td>LDBC Social Network Benchmark: Interactive workload v2.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/david-puroja-ldbc-snb-interactive-workload-v2.0.pdf">slides</a></td> +</tr> +<tr> +<td>14:00</td> +<td>14:15</td> +<td>Angela Bonifati (Lyon 1 University)</td> +<td>The quest for schemas in graph databases – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/angela-bonifati-the-quest-for-schemas-in-graph-databases.pdf">slides</a>, <a href="https://youtu.be/VT7cx3Jp7V8">video</a></td> +</tr> +<tr> +<td>14:20</td> +<td>14:35</td> +<td>Matteo Lissandrini (Aalborg University)</td> +<td>Understanding graph data representations in triplestores – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/matteo-lissandrini-understanding-graph-data-representations-in-triplestores.pdf">slides</a>, <a href="https://youtu.be/xqVMJZfh_JU">video</a></td> +</tr> +<tr> +<td>14:40</td> +<td>14:55</td> +<td>Wim Martens (University of Bayreuth)</td> +<td>Path representations – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/wim-martens-path-representations.pdf">slides</a>, <a href="https://youtu.be/Ma-E5dwgf-E">video</a></td> +</tr> +<tr> +<td>15:00</td> +<td>15:20</td> +<td>Audrey Cheng (UC Berkeley)</td> +<td>TAOBench: An end-to-end benchmark for social network workloads – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/audrey-cheng-taobench.pdf">slides</a>, <a href="https://youtu.be/1p8AStxS3es">video</a></td> +</tr> +</tbody> +</table> +<h4 id="saturday-philadelphia-marriott-downtownhttpswwwmarriottcomen-ushotelsphldt-philadelphia-marriott-downtown-room-401-402-4th-floor">Saturday (<a href="https://www.marriott.com/en-us/hotels/phldt-philadelphia-marriott-downtown/">Philadelphia Marriott Downtown</a>, room 401-402, 4th floor)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Keith Hare (WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/keith-hare-property-graph-standards-process-and-timing.pdf">slides</a>, <a href="https://youtu.be/xFVD3LWnKlc">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>10:35</td> +<td>Leonid Libkin (ENS Paris)</td> +<td>Pattern matching in GQL and SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/leonid-libkin-pattern-matching-in-gql-and-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/OvGsa0qLANE">video</a></td> +</tr> +<tr> +<td>10:40</td> +<td>10:55</td> +<td>Petra Selmer (Neo4j/WG3)</td> +<td>An overview of GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/petra-selmer-towards-gql-v1-a-property-graph-query-language-standard.pdf">slides</a>, <a href="https://youtu.be/tncf2FgyIyo">video</a></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (LDBC/WG3)</td> +<td>GQL 2.0: A technical manifesto – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-gql-2.0-a-technical-manifesto.pdf">slides</a>, <a 
href="https://youtu.be/upIvpYy8C2g">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>George Fletcher (TU Eindhoven)</td> +<td>PG-Keys (LDBC Property Graph Schema Working Group) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/george-fletcher-pg-keys-keys-for-property-graphs.pdf">slides</a>, <a href="https://youtu.be/_W8-jOtcObc">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Arvind Shyamsundar (Microsoft)</td> +<td>Graph capabilities in Microsoft SQL Server and Azure SQL Database – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/arvind-shyamsundar-graph-capabilities-in-microsoft-sql-server-and-azure-database.pdf">slides</a>, <a href="https://youtu.be/xxV2BfZupGw">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>13:30</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Daniël ten Wolde (CWI)</td> +<td>Implementing SQL/PGQ in DuckDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/daniel-ten-wolde-implementing-sql-pgq-in-duckdb.pdf">slides</a>, <a href="https://youtu.be/JmSfU0BTH5w">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Oszkár Semeráth, Kristóf Marussy (TU Budapest)</td> +<td>Generation techniques for consistent, realistic, diverse, and scalable graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oszkar-semerath-generation-techniques-for-consistent-realistic-diverse-and-scalable-graphs.pdf">slides</a>, <a href="https://youtu.be/hB6j6mvh-vA">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Molham Aref (RelationalAI)</td> +<td>Graph Normal Form – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/molham-aref-graph-normal-form.pdf">slides</a>, <a 
href="https://youtu.be/-kP4Raqr5KA">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Naomi Arnold (Queen Mary University of London)</td> +<td>Temporal graph analysis of the far-right social network Gab – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/naomi-arnold-temporal-graph-analysis-of-the-far-right-social-network-gab.pdf">slides</a>, <a href="https://youtu.be/ugSkFlif4PE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:05</td> +<td>Domagoj Vrgoč (PUC Chile)</td> +<td>Evaluating path queries in MillenniumDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/domagoj-vrgoc-regular-path-queries-in-millenniumdb.pdf">slides</a>, <a href="https://youtu.be/_OzJ6vI7GNU">video</a></td> +</tr> +<tr> +<td>15:10</td> +<td>15:25</td> +<td>Pavel Klinov, Evren Sirin (Stardog)</td> +<td>Stardog&rsquo;s experience with LDBC – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/evren-sirin-stardog-experience-with-ldbc.pdf">slides</a>, <a href="https://youtu.be/CBrEeOTqGKM">video</a></td> +</tr> +</tbody> +</table> + + + + + Announcing the LDBC Financial Benchmark Task Force + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + Thu, 26 May 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + <p>We are delighted to announce the set up of the <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench) task force</a>.</p> +<p>The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. 
The FinBench is scheduled to be released in the end of 2022.</p> +<p>Compared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.</p> +<p>The FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). See the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">Work Charter for FinBench</a></p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.</p> + + + + + Fourteenth TUC Meeting + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + Mon, 16 Aug 2021 16:00:00 +0200 + + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + <p>LDBC was hosting a one-day hybrid workshop, co-located with <a href="https://vldb.org/2021/">VLDB 2021</a> on <strong>August 16 (Monday) between 16:00–20:00 CEST</strong>.</p> +<p>The physical part of the workshop was held in room Akvariet 2 of the <a href="https://www.tivolihotel.com/">Tivoli Hotel</a> (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC&rsquo;s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.</p> +<p>Talks were scheduled to be 10 minutes with a short Q&amp;A session. We had three sessions. 
Their schedules are shown below.</p> +<h4 id="16001725-cest-ldbc-updates-benchmarks-query-languages">[16:00–17:25 CEST] LDBC updates, benchmarks, query languages</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>16:00</td> +<td>Peter Boncz (CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a></td> +</tr> +<tr> +<td>16:05</td> +<td>Gábor Szárnyas (CWI)</td> +<td>Overview of LDBC benchmarks – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-benchmarks.pdf">slides</a></td> +</tr> +<tr> +<td>16:12</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>LDBC Social Network Benchmark results with TigerGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mingxi-wu-tigergraph-snb-preliminary-results.pdf">slides</a></td> +</tr> +<tr> +<td>16:24</td> +<td>Xiaowei Zhu (Ant Group)</td> +<td>Financial Benchmark proposal – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/xiaowei-zhu-financial-benchmark.pdf">slides</a></td> +</tr> +<tr> +<td>16:36</td> +<td>Petra Selmer (Neo4j)</td> +<td>Status report from the Existing Languages Working Group (ELWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/petra-selmer-elwg.pdf">slides</a>, <a href="https://youtu.be/I5A8VuFDhsA">video</a></td> +</tr> +<tr> +<td>16:48</td> +<td>Jan Hidders (Birkbeck)</td> +<td>Status report from the Property Graph Schema Working Group (PGSWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/jan-hidders-pgswg.pdf">slides</a>, <a href="https://youtu.be/iEbVi9T-HVk">video</a></td> +</tr> +<tr> +<td>17:00</td> +<td>Keith Hare (JCC 
Consulting)</td> +<td>Database Language Standards Structure and Process, SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/keith-hare-database-language-standards-structure-and-process-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/ZgFCuzods4g">video</a></td> +</tr> +<tr> +<td>17:12</td> +<td>Stefan Plantikow (GQL Editor)</td> +<td>Report on the GQL standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/stefan-plantikow-gql.pdf">slides</a>, <a href="https://youtu.be/z0pN5NwKsgc">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="17351845-cest-systems-and-data-structures">[17:35–18:45 CEST] Systems and data structures</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>17:35</td> +<td>Vasileios Trigonakis (Oracle Labs)</td> +<td>PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasileios-trigonakis-pgxd-adfs.pdf">slides</a>, <a href="https://youtu.be/cv2ZfWRBOek">video</a></td> +</tr> +<tr> +<td>17:47</td> +<td>Matthias Hauck (SAP)</td> +<td>JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/matthias-hauck-json-spatial-graph-sap-hana-cloud.pdf">slides</a>, <a href="https://youtu.be/dgpMJFho6Q8">video</a></td> +</tr> +<tr> +<td>17:59</td> +<td>Nikolay Yakovets (Eindhoven University of Technology)</td> +<td>AvantGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/nikolay-yakovets-avantgraph.pdf">slides</a>, <a href="https://youtu.be/9M9FOycovTw">video</a></td> +</tr> +<tr> +<td>18:11</td> +<td>Semih Salihoglu (University of Waterloo)</td> +<td>GRainDB: Making RDBMSs Efficient on Graph Workloads 
Through Predefined Joins – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semih-salihoglu-graindb.pdf">slides</a>, <a href="https://youtu.be/FFK3y6vPHJs">video</a></td> +</tr> +<tr> +<td>18:23</td> +<td>Semyon Grigorev (Saint Petersburg University)</td> +<td>Context-free path querying: Obstacles on the way to adoption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semyon-grigorev-cfpq.pdf">slides</a>, <a href="https://youtu.be/pha1xIpEL3I">video</a></td> +</tr> +<tr> +<td>18:35</td> +<td>Per Fuchs (Technical University of Munich)</td> +<td>Sortledton: A universal, transactional graph data structure – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/per-fuchs-sortledton.pdf">slides</a>, <a href="https://youtu.be/33ZjsNN0hhU">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="1855-2000-cest-high-level-approaches-and-benchmarks">[18:55-20:00 CEST] High-level approaches and benchmarks</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>18:55</td> +<td>Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)</td> +<td>Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/angelos-christos-anadiotis-investigative-journalism-graph-data-management.pdf">slides</a>, <a href="https://youtu.be/a1VYjyec8dg">video</a></td> +</tr> +<tr> +<td>19:07</td> +<td>Vasia Kalavri (Boston University)</td> +<td>Learning to partition unbounded graph streams – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasia-kalavri-learning-to-partition-unbounded-graph-streams.pdf">slides</a>, <a 
href="https://youtu.be/PTlUABKWniA">video</a></td> +</tr> +<tr> +<td>19:19</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Towards a Hybrid OLTP-OLAP Graph Benchmark – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/muhammad-attahir-jibril-hybrid-oltp-olap-benchmark.pdf">slides</a>, <a href="https://youtu.be/tMBVszTSJXc">video</a></td> +</tr> +<tr> +<td>19:31</td> +<td>Riccardo Tommasini (University of Tartu)</td> +<td>An outlook on Benchmarks for Graph Stream Processing – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/riccardo-tommasini-graph-stream-processing-benchmarks.pdf">slides</a>, <a href="https://youtu.be/HabvJvPXsLc">video</a></td> +</tr> +<tr> +<td>19:43</td> +<td>Mohamed Ragab (University of Tartu)</td> +<td>Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mohamed-ragab-benchranking.pdf">slides</a>, <a href="https://youtu.be/mZ8LhGUq7Wg">video</a></td> +</tr> +</tbody> +</table> + + + + + Thirteenth TUC Meeting + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + Tue, 30 Jun 2020 14:00:00 +0000 + + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + <p>LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Gabor Szarnyas (BME) to register.</p> +<h3 id="snb-task-force">SNB Task Force</h3> +<ul> +<li>Progress report +<ul> +<li>ACID compliance test suite</li> +<li>Integrating deletions to Datagen</li> +<li>Migrating Datagen to Spark</li> +<li>Redesign of BI read queries</li> +<li>Extensions to the driver</li> +</ul> +</li> +<li>Ongoing work +<ul> +<li>Datagen: tuning the distribution of deletes</li> +<li>Interactive 2.0 workload</li> +<li>BI 1.0 workload</li> +</ul> +</li> +</ul> +<p>Zoom links will be sent through email.</p> + + + + + Speeding Up LDBC SNB Datagen + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + Fri, 12 Jun 2020 00:00:00 +0000 + + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + <p>LDBC&rsquo;s <a href="#references">Social Network Benchmark [4]</a> (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems&rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.</p> +<p>LDBC SNB provides <a href="https://github.com/ldbc/ldbc_snb_datagen">Datagen</a> (Data Generator), which produces synthetic datasets, mimicking a social network&rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms, as well as public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).</p> +<h2 id="overview">Overview</h2> +<p>The benchmark&rsquo;s specification describes a social network <a href="https://github.com/ldbc/ldbc_snb_docs/blob/9253abbde94ec7eaccd366c5d4c15cca30752e36/figures/schema-comfortable.pdf">data model</a> which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment or like each other&rsquo;s posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations and are fixed values. For the detailed specifications of the benchmark and the Datagen component, see <a href="#references">References</a>.</p> +<p>Datasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).</p> +<p><img src="datagen_flow.png" alt=""> <br> <em>Figure 1. LDBC SNB Datagen Process on Hadoop</em></p> +<p>In the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.</p> +<p><em>Note: The diagram shows the call sequence as implemented. 
All steps are sequential &ndash; including the relationship generation &ndash;, even in cases when the data dependencies would allow for parallelization.</em></p> +<p>Entities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. Most entities follow a shallow representation, i.e. foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup> A notable exception is the Knows edge which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this <em>edge as property</em> representation makes the data harder to handle in SQL than it would be with a flat join table.</p> +<p>Entity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup></p> +<p>Serialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. 
Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.</p> +<h2 id="motivations-for-the-migration">Motivations for the migration</h2> +<p>The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:</p> +<ul> +<li> +<p><strong>Better memory utilization:</strong> MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.</p> +</li> +<li> +<p><strong>Smaller codebase:</strong> The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.</p> +</li> +<li> +<p><strong>Small entry cost:</strong> Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.</p> +</li> +<li> +<p><strong>Incremental improvements:</strong> Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. 
Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.</p> +</li> +<li> +<p><strong>OSS, commodity:</strong> Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.</p> +</li> +</ul> +<h2 id="first-steps">First steps</h2> +<p>The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. The following bullet-points summarize the key notions that cropped up during the process.</p> +<ul> +<li> +<p><strong>Use your memory:</strong> A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using <code>MEMORY_AND_DISK</code>). In short, the default caching strategy was used everywhere.</p> +</li> +<li> +<p><strong>Regression tests:</strong> Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. 
The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.</p> +</li> +<li> +<p><strong>Thread-safety concerns:</strong> Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn&rsquo;t a concern in the original implementation due to the fact that MapReduce doesn&rsquo;t use thread-based parallelization for mappers and reducers.<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, <sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup> which makes it somewhat harder to find potential unguarded shared variables.</p> +</li> +</ul> +<h2 id="case-study-person-ranking">Case study: Person ranking</h2> +<p>Migrating was rather straightforward, however, the so-called person ranking step required some thought. 
The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the <a href="#references">S3G2 paper [3]</a>.</p> +<h3 id="the-original-mapreduce-version">The original MapReduce version</h3> +<p><img src="person_ranking.svg" alt=""> <br> <em>Figure 2. Diagram of the MapReduce code for ranking persons</em></p> +<p>The implementation, shown in pseudocode above, works as follows:</p> +<ol> +<li>The equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low.</li> +<li>The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate &ldquo;side-channel&rdquo; file upon the completion of a reduce task.</li> +<li>In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person.</li> +</ol> +<p>Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.</p> +<h3 id="the-migrated-version">The migrated version</h3> +<p>Spark provides a sortBy function which takes care of the first step above in a single line. 
The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.</p> +<h2 id="benchmarks">Benchmarks</h2> +<p>Benchmarks were carried out on AWS <a href="https://aws.amazon.com/emr/">EMR</a>, originally utilising <a href="https://aws.amazon.com/ec2/instance-types/i3/">i3.xlarge</a> instances because of their fast NVMe SSD storage and ample amount of RAM.</p> +<p>The application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yielded slowdowns for higher values. The Spark version on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.<sup id="fnref:5"><a href="#fn:5" class="footnote-ref" role="doc-noteref">5</a></sup> The MapReduce results were as follows:</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>16</td> +<td>1.60</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>34</td> +<td>1.13</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>40</td> +<td>1.20</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>44</td> +<td>1.32</td> +</tr> +</tbody> +</table> +<p>It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.</p> +<p><img src="mr_sf100_cpu_load.png" alt=""> <br> +<em>Figure 3. CPU Load for the Map Reduce cluster is bursty and less than<br> +50% on average (SF100, 2nd graph shows master)</em></p> +<p><img src="mr_sf100_mem_free.png" alt=""> <br> +<em>Figure 4. 
The job only starts to consume memory when already 10 minutes<br> +into the run (SF100, 2nd graph shows master)</em></p> +<p>Let&rsquo;s see how Spark fares.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>10</td> +<td>1.00</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>21</td> +<td>0.70</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>27</td> +<td>0.81</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>36</td> +<td>1.08</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +</tbody> +</table> +<p>A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn&rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. In fact, the OOM errors encountered when running SF3000 support this hypothesis even further. It was thus proposed to scale up the RAM in the instances. 
The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.</p> +<p><img src="spark_sf100_cpu_load.png" alt=""> <br> +<em>Figure 5. Full CPU utilization for Spark (SF100, last graph shows<br> +master)</em></p> +<p><img src="spark_sf100_mem_free.png" alt=""> <br> +<em>Figure 6. Spark eats up memory fast (SF100, 2nd graph shows master)</em></p> +<p>i3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it <em>only</em> has a 300 GB SSD.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>16</td> +<td>0.48</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>21</td> +<td>0.63</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>26</td> +<td>0.78</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +<tr> +<td>10000</td> +<td>303</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<p>The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.</p> +<h2 id="next-steps">Next steps</h2> +<p>The next improvement is refactoring the serializers so they use Spark&rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. 
EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.</p> +<p>As already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.</p> +<p>The Spark migration also serves as an important building block for the next generation of LDBC benchmarks. As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for <a href="#references">generating delete operations [1]</a>. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. 
Thanks for your help and feedback!</p> +<h2 id="references">References</h2> +<p>[1] <a href="https://ldbcouncil.org/docs/papers/datagen-deletes-grades-nda-2020.pdf">Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark&rsquo;s Data Generator</a></p> +<p>[2] <a href="https://www.youtube.com/watch?v=ZQOLuCOOpSI">9th TUC Meeting &ndash; LDBC SNB Datagen Update &ndash; Arnau Prat (UPC)</a> - <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">slides</a></p> +<p>[3] <a href="https://research.vu.nl/en/publications/s3g2-a-scalable-structure-correlated-social-graph-generator">S3G2: a Scalable Structure-correlated Social Graph Generator</a></p> +<p>[4] <a href="https://arxiv.org/abs/2001.02299">The LDBC Social Network Benchmark</a></p> +<p>[5] <a href="https://ldbcouncil.org/">LDBC</a> - <a href="https://github.com/ldbc">LDBC GitHub organization</a></p> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>Also makes it easier to map to a tabular format thus it is a SQL friendly representation.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>It&rsquo;s hard to imagine this done declaratively in SQL.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>Instead, multiple YARN containers have to be used if you want to parallelize on the same machine.&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>Although editors usually render these using different font styles.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:5"> +<p>With the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). 
During serialization, we check for such entities and omit them. However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.&#160;<a href="#fnref:5" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Twelfth TUC Meeting + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + Fri, 05 Jul 2019 08:30:00 +0100 + + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + <p>LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event on the last Friday of <strong><a href="https://sigmod2019.org/">SIGMOD/PODS 2019</a></strong> in Amsterdam, The Netherlands, in the conference venue of <strong><a href="http://sigmod2019.org/conf_venue">Beurs van Berlage</a></strong>. The room is the Mendes da Silva kamer. Please check its tips for <strong><a href="http://sigmod2019.org/accommodation">accommodation in Amsterdam</a></strong>.</p> +<p>Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2019">GRADES-NDA 2019</a>, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).</p> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.</p> +<p><strong>Talk proposals can be sent to Peter Boncz</strong>, who is also the local organizer. <strong>Please also send your slides to this email for archiving on this site.</strong></p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting, there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).</p> +<p>The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (<strong>room: Mendes da Silva kamer</strong>):</p> +<p>08:30-10:30 LDBC Board Meeting (non-public)</p> +<p>10:30-11:00 Coffee</p> +<p>11:00-12:45 Session 1: Graph Benchmarks</p> +<ul> +<li> +<p>11:00-11:05 Welcome &amp; introduction</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230404.pdf">11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706117.pdf">11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706118.pdf">12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706130.pdf">12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics</a></p> +</li> +</ul> +<p>12:45-14:00 Lunch</p> +<p>14:00-16:05 Session 2: Graph Query Languages</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706120.pdf">14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706121.pdf">14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706122.pdf">report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706119.pdf">14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706129.pdf">15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in GSQL, TigerGraph&rsquo;s query language</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230401.pdf">15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language</a></p> +</li> +</ul> +<p>16:05-16:30 Coffee</p> +<p>16:30-17:50 Session 3: Graph System Performance</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111968258.pdf">16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706124.pdf">16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity</a> <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706116.pptx">pptx</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706128.pdf">17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706133.pdf">17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data</a></p> +</li> +</ul> +<p>If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.</p> + + + + + Eleventh TUC Meeting + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + Fri, 08 Jun 2018 08:30:00 -0500 + + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + <p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event preceding the <a href="https://sigmod2018.org/">SIGMOD/PODS 2018</a> conference in Houston, Texas (not too far away, the whole next week). 
Note also that at SIGMOD/PODS in Houston on Sunday 10, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2018/">GRADES-NDA 2018</a> as well, so you might combine travel.</p> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (<a href="mailto:boncz@cwi.nl">boncz@cwi.nl</a>) and Larri (<a href="mailto:larri@ac.upc.edu">larri@ac.upc.edu</a>). Local organizer is Juan Sequeda (<a href="mailto:juanfederico@gmail.com">juanfederico@gmail.com</a>).</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00:</p> +<ul> +<li> +<p>10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090478.pdf">10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo</a></p> +</li> +<li> +<p>11:00-11:30 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090466.pdf">11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090463.pdf">11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090472.pdf">12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI</a></p> +</li> +<li> +<p>12:45-14:00 lunch</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090474.pdf">14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090481.pdf">14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090485.pdf">14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service</a></p> +</li> +<li> +<p>15:15-15:40 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99811329.pdf">15:40-16:05 Bryon Jacob (data.world): Broadening the Semantic Web</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99287041.pdf">16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99745793.pdf">16:30-16:55 Arthur Keen (Cambridge Semantics): AnzoGraph</a></p> +</li> +<li> +<p><a href="http://relational.ai/">16:55-17:20 Molham Aref (relational.ai)</a> - Introducing... 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99418113.pdf">relational.ai</a></p> +</li> +<li> +<p>18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701</p> +</li> +</ul> +<h3 id="location">Location</h3> +<p>The TUC will be held at the <a href="https://www.cs.utexas.edu/">University of Texas at Austin, Department of Computer Science</a> in the <a href="https://www.google.com/maps/place/The+University+of+Texas:+Department+of+Computer+Science/@30.2860955,-97.737582,18z/data=!4m5!3m4!1s0x0:0x12edecc8226b3241!8m2!3d30.2862279!4d-97.7365348">Gates Dell Complex (GDC): 2317 Speedway, Austin, TX 78712</a> Room: GDC 6.302</p> +<p>The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take the elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.</p> +<h3 id="from-austin-to-sigmodpods-houston-on-saturday-june-9">From Austin to SIGMOD/PODS (Houston) on Saturday June 9</h3> +<p>Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.</p> +<h4 id="bus">Bus</h4> +<p>One option is to take a <a href="https://us.megabus.com/journey-planner/journeys?days=1&amp;concessionCount=0&amp;departureDate=2018-06-09&amp;destinationId=318&amp;inboundOtherDisabilityCount=0&amp;inboundPcaCount=0&amp;inboundWheelchairSeated=0&amp;nusCount=0&amp;originId=320&amp;otherDisabilityCount=0&amp;pcaCount=0&amp;totalPassengers=1&amp;wheelchairSeated=0">MegaBus that departs from downtown Austin and arrives at downtown Houston</a>.</p> +<p>There is a bus that departs at 12:00pm and arrives at 3:00pm. Cost is $20 (as of April 23).</p> +<p>If you want to spend the day in Austin, there is a bus that departs at 9:55pm and arrives at 12:50am. 
Cost is $5 (as of April 23).</p> + + + + + Tenth TUC Meeting + https://ldbcouncil.org/event/tenth-tuc-meeting/ + Fri, 01 Sep 2017 10:30:00 +0100 + + https://ldbcouncil.org/event/tenth-tuc-meeting/ + <p>This will be a one-day event at the <a href="http://www.vldb.org/2017">VLDB 2017</a> conference in Munich, Germany on September 1, 2017.</p> +<p>Topics and activities of interest in these TUC meetings are:</p> +<ul> +<li>Presentation on graph data management usage scenarios.</li> +<li>Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Adrian Diaz (UPC) at <a href="mailto:adiaz@ac.upc.edu">adiaz@ac.upc.edu</a> to register; registration is free, but required.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00</p> +<p>10:30-12:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87588865.pdf">Peter Boncz (CWI): GraphQL task force update - the G-CORE proposal</a> (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868018.pptx">pptx</a>)</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868008.pdf">Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload</a></li> +<li>Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868014.pdf">LDBC Graphalytics v0.9</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868013.pdf">Graphalytics Global Competition and Graphalytics Custom Benchmark</a></li> +</ul> +<p>12:00-13:30: lunch break</p> +<p>13:30-15:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868024.pdf">Arnau Prat (UPC): Datasynth: Democratizing property graph 
generation</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868026.pdf">Marcus Paradies (SAP): SAP HANA GraphScript</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031809.pdf">Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform</a></li> +<li>Gaétan Hains (Huawei): Cost semantics for graph queries</li> +</ul> +<p>15:00-15:30: break</p> +<p>15:30-17:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031812.pdf">Petra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87195650.pdf">Markus Kaindl (Springer): SN SciGraph &ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain</a></li> +<li>Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks</li> +<li>Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset</li> +</ul> +<p>Speakers should aim for a <strong>20-minute talk</strong>.</p> +<p>Further:</p> +<ul> +<li>on Friday evening (19:00-21:00) there will be a <strong>social dinner</strong> at <a href="https://www.loewenbraeukeller.com/en/pub-and-beer-garden/">Löwenbräukeller</a>, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).</li> +<li>on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.</li> +</ul> +<h3 id="venue">Venue</h3> +<p>The Technical University of Munich (TUM) is hosting that week the <a href="http://www.vldb.org/2017">VLDB conference</a>; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops 
ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.</p> +<p>The TUC meeting will be held in <strong>Room 2607</strong> alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).</p> +<p><strong>address: Technische Universität München (TUM), Arcisstraße 21, 80333 München</strong></p> +<p><a href="https://www.google.nl/maps/place/Technische+Universit%C3%A4t+M%C3%BCnchen/@48.14966,11.5656715,17z/data=!3m1!4b1!4m5!3m4!1s0x479e7261336d8c11:0x79a04d44dc5bf19d!8m2!3d48.14966!4d11.5678602?hl=en">Google Maps</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920002.jpg" alt=""><br> +<img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920003.jpg" alt=""></p> + + + + + Ninth TUC Meeting + https://ldbcouncil.org/event/ninth-tuc-meeting/ + Thu, 09 Feb 2017 15:07:18 -0400 + + https://ldbcouncil.org/event/ninth-tuc-meeting/ + <p>LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> in Walldorf, Germany on February 9+10, 2017.</p> +<p>This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested, please contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.</p> +<p>Thursday evening (19:00-21:00) there will be a <strong>social dinner</strong> in Heidelberg.</p> +<p>Friday morning the event resumes from 9:00-12:00. 
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.</p> +<h4 id="social-dinner">Social Dinner</h4> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235334.png" alt=""></p> +<p><strong>Address: Hauptstraße 217, 69117 Heidelberg</strong><br> +<strong>Time: 19:00 / 7pm</strong></p> +<p>(See attachments at the bottom of the page)</p> +<h5 id="thursday">Thursday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>Welcome and logistics - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>9:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235329.pdf">Intro + state of the LDBC - Josep Lluis Larriba Pey</a> (UPC)</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235338.pdf">LDBC Graph QL task force</a> - Hannes Voigt (TU Dresden)</td> +</tr> +<tr> +<td>9:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235335.pdf">PGQL Status Update and Comparison to LDBC&rsquo;s Graph QL proposals</a> - Oskar van Rest (Oracle Labs)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628546.pdf">Adding shortest-paths to MonetDB</a> - Dean de Leo (CWI)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431939.pdf">Evolving Cypher for processing multiple graphs</a> - Stefan Plantikow (Neo Technology)</td> +</tr> +<tr> +<td>11:10</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235346.pdf">Standardizing Graph Database Functionality - An Invitation to Collaborate</a> - Jan Michels (ISO/ANSI SQL, Oracle)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235343.pdf">Dgraph: Graph database for production environment</a> - Tomasz Zdybal (Dgraph.io)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431945.pdf">LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap</a> - Alexandru Iosup (TU Delft)</td> +</tr> +<tr> +<td>13:20</td> +<td>LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)</td> +</tr> +<tr> +<td>13:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">LDBC SNB Datagen Update</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431943.pdf">LDBC SNB Business Intelligence Workload: Chokepoint Analysis</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431947.pdf">LDBC Benchmark Cost Specification</a> (+discussion) - Moritz Kaufmann (TU Munich)</td> +</tr> +<tr> +<td>14:40</td> +<td>coffee break</td> +</tr> +<tr> +<td>15:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76316673.pdf">EYWA: the Distributed Graph Engine in Huawei MIND Platform</a> - Yinglong Xia (Huawei)</td> +</tr> +<tr> +<td>15:30</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431949.pdf">Graph Processing in SAP HANA</a> - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>15:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628563.pdf">Distributed Graph Analytics with Gradoop</a> - Martin Junghanns (Univ Leipzig)</td> +</tr> +<tr> +<td>16:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152834.pdf">Distributed graph flows: Cypher on Flink and Gradoop</a> - Max Kießling (Neo Technology)</td> +</tr> +<tr> +<td>16:30</td> +<td>closing - Peter Boncz</td> +</tr> +<tr> +<td>17:30</td> +<td>end</td> +</tr> +</tbody> +</table> +<h5 id="friday">Friday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>welcome - Peter Boncz</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152833.pdf">Graph processing in obi4wan</a> - Frank Smit (OBI4WAN)</td> +</tr> +<tr> +<td>9:40</td> +<td>Graph problems in the space domain - Albrecht Schmidt (ESA)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75792387.pdf">Medical Ontologies for Healthcare</a> - Michael Neumann (SAP)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76447745.pdf">The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries</a> - Gabor Szarnyas (BME)</td> +</tr> +<tr> +<td>11:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76021761.pdf">Efficient sparse 
matrix computations and their generalization to graph computing applications</a> - Albert-Jan Yzelman (Huawei)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152837.pdf">Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge</a> - Atanas Kyriakov (Ontotext)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td>LDBC Board of Directors Meeting</td> +</tr> +<tr> +<td>17:00</td> +<td>end</td> +</tr> +</tbody> +</table> +<h3 id="logistics">Logistics</h3> +<h5 id="important-things-to-know"><strong>Important things to know</strong></h5> +<p>The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">link</a></p> +<h5 id="venue"><strong>Venue</strong></h5> +<p>The TUC meeting will be held in the <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> at the SAP Guesthouse Kalipeh (<a href="https://www.kalipeh.com">https://www.kalipeh.com</a>). The address is:</p> +<p><strong>WDF 44 / SAP Guesthouse Kalipeh<br> +Dietmar-Hopp-Allee 15<br> +69190 Walldorf<br> +Germany</strong></p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p><a href="https://www.google.com/maps/place/SAP+Guesthouse+Kalipeh/@49.2951903,8.6436224,17z/data=!3m1!4b1!4m5!3m4!1s0x4797bea343a566af:0xd70698f3503ab74b!8m2!3d49.2951868!4d8.6458111">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/69042180.png" alt=""></p> +<h4 id="getting-there"><strong>Getting there</strong></h4> +<h5 id="by-plane"><strong>By plane</strong></h5> +<p>There are two airports close to SAP&rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). 
The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.</p> +<p>When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt Hahn is approximately one hour from the Frankfurt main airport by car.</p> +<p>The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).</p> +<p>Journey time from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).</p> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<p><strong>Traveling from Frankfurt Airport (FRA) to SAP Headquarters:</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.&rdquo;</li> +<li>Follow the A5 to &ldquo;Basel/Karlsruhe/Heidelberg.&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<p>(Should you use a navigational system which does not recognize the street name &lsquo;Dietmar-Hopp-Allee&rsquo; please use &lsquo;Neurottstrasse&rsquo; instead.)</p> +<p><strong>Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:</strong></p> +<p>To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. 
The route via Karlsruhe is a bit shorter yet may be more congested.</p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A8/Stuttgart/B27.&rdquo;</li> +<li>Stay on A8 and follow the sign for &ldquo;Karlsruhe/Heilbronn/Singen/A8.&rdquo;</li> +<li>Follow A8 to Karlsruhe.</li> +<li>Take exit 41 &ndash; &ldquo;Dreieck Karlsruhe&rdquo; to merge onto A5 toward &ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<h6 id="parking"><strong>Parking</strong></h6> +<p>The closest parking lot to the event location is P7 (see figure above).</p> +<h5 id="by-train"><strong>By Train</strong></h5> +<p>As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.</p> +<p><strong>From Frankfurt Airport (FRA) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to Terminal 1, level T (see overview in Appendix).</li> +<li>Go to the AIRail Terminal &ndash; &ldquo;Fernbahnhof&rdquo; (long-distance trains).</li> +<li>Choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP.&rdquo; It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> +<p><strong>From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to the S-Bahn station in the airport, following the sign (station is called &ldquo;Stuttgart Flughafen/Messe&rdquo;).</li> +<li>Take 
train number S2 or S3 to &ldquo;Stuttgart Hauptbahnhof&rdquo; (main station).</li> +<li>From Stuttgart Hauptbahnhof choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP&rdquo;. It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> + + + + + LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + Tue, 06 Sep 2016 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + <p>LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.</p> +<p>LDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.</p> +<p>Tim Hegeman of <a href="https://www.tudelft.nl">TU Delft</a> is today presenting the technical paper describing LDBC Graphalytics at the important <a href="https://www.vldb.org/conference.html">VLDB</a> (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.</p> +<p>LDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.</p> +<p>Learn more: <a href="/ldbc-graphalytics">LDBC Graphalytics</a></p> +<p>GitHub: <a href="https://github.com/tudelft-atlarge/graphalytics">https://github.com/tudelft-atlarge/graphalytics</a></p> + + + + + Eighth TUC Meeting + https://ldbcouncil.org/event/eighth-tuc-meeting/ + Wed, 22 Jun 2016 14:45:20 -0400 + + https://ldbcouncil.org/event/eighth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a> in Redwood Shores facility on <strong>Wednesday and Thursday June 22-23, 2016</strong>.</p> +<p>This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify Oracle security in advance, registration requests need to be in by <strong>June 12</strong>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. 
We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +<li><a href="#accommodation">Accommodation</a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1pm.</p> +<h6 id="wednesday-22th-of-june-2016-room-203"><strong>Wednesday, 22nd of June 2016 (Room 203)</strong></h6> +<p>(full morning: LDBC Board of Directors meeting)</p> +<ul> +<li>12:00 - 13:00 Lunch (provided)</li> +<li>13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome.</li> +<li>13:30 - 14:00 Peter Boncz (CWI) <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133891.pdf">LDBC introduction and status update</a>.</li> +<li>14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. 
Larriba-Pey)</li> +<li>14:00 Arnau Prat (DAMA-UPC). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133902.pdf">Social Network Benchmark, Interactive workload</a>.</li> +<li>14:30 Tim Hegeman (TU Delft). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133893.pdf">Social Network Benchmark, Analytics workload</a>.</li> +<li>15:00 - 15:30 Coffee break</li> +<li>15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133897.pdf">Graphing Healthcare Networks: Data, Analytics, and Use Cases.</a></li> +<li>16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133901.pdf">Frappé: Querying and managing evolving code dependency graphs</a>.</li> +<li>16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133895.pdf">UniProt: challenges of a public SPARQL endpoint.</a></li> +</ul> +</li> +<li>17:00 - 18:30 Graph Technologies (chair Peter Boncz) +<ul> +<li>17:00 Eugene I. Chong (Oracle USA). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133904.pdf">Balancing Act to improve RDF Query Performance in Oracle Database</a>.</li> +<li>17:30 Lijun Chang (University of New South Wales). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133906.pdf">Efficient Subgraph Matching by Postponing Cartesian Products</a>.</li> +<li>18:00 Weining Qian (East China Normal University). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133908.pdf">On Statistical Characteristics of Real-Life Knowledge Graphs</a>.</li> +</ul> +</li> +</ul> +<h6 id="thursday-23th-of-june-2016-room-203"><strong>Thursday, 23th of June 2016 (Room 203)</strong></h6> +<ul> +<li>08:00 - 09:00 Breakfast (provided)</li> +<li>09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) +<ul> +<li>09:00 Peter Boncz (CWI). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133896.pdf">Query Language Task Force status</a></li> +<li>09:45 Marcus Paradies (SAP). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297729.pdf">Social Network Benchmark, Business Intelligence workload</a></li> +</ul> +</li> +<li>10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) +<ul> +<li>10:00 Sergey Edunov (Facebook). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297731.pdf">Generating realistic trillion-edge graphs</a></li> +<li>10:30 George Fletcher (TU Eindhoven). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297733.pdf">An open source framework for schema-driven graph instance and graph query workload generation</a>.</li> +<li>11:00 Yinglong Xia (Huawei Research America): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297735.pdf">An Efficient Big Graph Analytics Platform</a>.</li> +<li>11:30 Zhe Wu (Oracle USA). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297737.pdf">Bridging RDF Graph and Property Graph Data Models</a></li> +</ul> +</li> +<li>12:00 - 13:30 Lunch (provided)</li> +<li>13:30 - 15:30 Graph Technologies (chair Arnau Prat) +<ul> +<li>13:30 Tobias Lindaaker (Neo Technology). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297740.pdf">An open standard for graph queries: the Cypher contribution</a></li> +<li>14:00 Arash Termehchy (Oregon State University). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297742.pdf">Toward Representation Independent Graph Querying &amp; Analytics</a></li> +<li>14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297745.pdf">In the service of the federation</a></li> +<li>15:00 Nandish Jayaram (Pivotal). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297747.pdf">Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs</a>.</li> +</ul> +</li> +<li>15:30 - 16:00 Coffee break</li> +<li>16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>16:00 Jans Aasman (Franz Inc.). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428806.pdf">Semantic Data Lake for Healthcare</a></li> +<li>16:15 Kevin Madden (Tom Sawyer Software). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428812.pdf">Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis</a></li> +<li>16:45 Juan Sequeda (Capsenta). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428810.pdf">Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources</a></li> +<li>17:15 Kevin Wilkinson (Hewlett Packard Labs). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428808.pdf">LDBC SNB extensions</a></li> +</ul> +</li> +<li>17:45 - 18:15 Closing discussion</li> +</ul> +<h6 id="friday-24th-of-june-2016-room-105"><strong>Friday, 24th of June 2016 (Room 105)</strong></h6> +<p>At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (<strong>GRADES16</strong>).</p> +<p>18:30 social dinner for GRADES registrants (place to be announced)</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>22nd and 23rd June 2016</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a></p> +<p>The address is:</p> +<p><strong>Room 203 (Wed-Thu) &amp; Room 105 (Fri)</strong><br> +<strong>Oracle Conference Center</strong><br> +<strong>350 Oracle Parkway</strong><br> +<strong>Redwood City, CA 94065, USA</strong></p> +<p><strong>Maps and situation</strong></p> +<p><a href="https://www.google.com/maps/place/Oracle+Conference+Center/@37.5322827,-122.2667034,17z/data=!3m1!4b1!4m2!3m1!1s0x808f98b5450e8ca3:0xdc75e8b1c02bbb91">Google Maps link</a></p> +<p>Oracle Campus map:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/40927234.jpg" alt=""></p> +<h5 id="getting-there"><strong>Getting there</strong></h5> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<ul> +<li>[Southbound] <strong>-</strong> Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine 
World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +<li>[Northbound] <strong>-</strong> Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +</ul> +<h5 id="parking"><strong>Parking</strong></h5> +<p>The Conference Center has a designated parking lot located directly across from the building. If the lot is filled there is also additional parking in any of the parking garages located near by. No parking permits are needed.</p> +<h5 id="public-transport"><strong>Public transport</strong></h5> +<p>Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.</p> +<ul> +<li>Caltrain timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/weekdaytimetable.html</a></li> +<li>Oracle Shuttle timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html</a></li> +</ul> +<p>You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.</p> +<p>Alternatively, SamTrans (San Mateo County&rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.</p> + + + + + LDBC and Apache Flink + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + Mon, 16 Nov 2015 14:47:00 +0000 + + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + <p>Apache Flink <a href="#references">[1]</a> is an open source platform for 
distributed stream and batch data processing. Flink&rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.</p> +<p><img src="https://flink.apache.org/img/flink-stack-small.png" alt=""></p> +<p>Flink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.</p> +<p>The following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">&gt;</span> text <span style="color:#f92672">=</span> env<span style="color:#f92672">.</span><span style="color:#a6e22e">fromElements</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the past controls the future.&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the present controls the past.&#34;</span><span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> 
+</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>Tuple2<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">,</span> Integer<span style="color:#f92672">&gt;&gt;</span> wordCounts <span style="color:#f92672">=</span> text +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">flatMap</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> LineSplitter<span style="color:#f92672">())</span> <span style="color:#75715e">// splits the line and outputs (word,1) +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>tuples<span style="color:#f92672">.</span><span style="color:#a6e22e">groupBy</span><span style="color:#f92672">(</span><span style="color:#ae81ff">0</span><span style="color:#f92672">)</span> <span style="color:#75715e">// group by word +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">.</span><span style="color:#a6e22e">sum</span><span style="color:#f92672">(</span><span style="color:#ae81ff">1</span><span style="color:#f92672">);</span> <span style="color:#75715e">// sum the 1&#39;s +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>wordCounts<span style="color:#f92672">.</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop <a href="#references">[2]</a>. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. 
To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing <a href="#references">[3]</a>. Using the class <code>LDBCToFlink</code>, LDBC output files can be read directly from HDFS or from the local file system:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;hdfs:///ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> <span style="color:#75715e">// or &#34;/path/to/social_network&#34; +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> vertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span 
style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> edges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The tuple classes <code>LDBCVertex</code> and <code>LDBCEdge</code> hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.</p> +<p>Each <code>LDBCVertex</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all vertices * a vertex label (e.g. <code>Person</code>, <code>Comment</code>) * a key-value map of properties including also multivalued properties<br> +(e.g. <code>Person.email</code>)</li> +</ul> +<p>Each <code>LDBCEdge</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all edges</li> +<li>an edge label (e.g. <code>knows</code>, <code>likes</code>)</li> +<li>a source vertex identifier</li> +<li>a target vertex identifier</li> +<li>a key-value map of properties</li> +</ul> +<p>The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label <code>Person</code> and edges with the label <code>knows</code> and use Gelly to compute the connected components of that subgraph. 
The full source code is available on GitHub <a href="#references">[4]</a>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter vertices with label “Person” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> ldbcVertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span 
style="color:#f92672">.</span><span style="color:#a6e22e">VERTEX_CLASS_PERSON</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter edges with label “knows” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> ldbcEdges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span style="color:#f92672">.</span><span style="color:#a6e22e">EDGE_CLASS_KNOWS</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly vertices suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> vertices <span style="color:#f92672">=</span> ldbcVertices<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly edges suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span 
style="color:#f92672">&lt;</span>Edge<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;&gt;</span> edges <span style="color:#f92672">=</span> ldbcEdges<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>Graph<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;</span> g <span style="color:#f92672">=</span> Graph<span style="color:#f92672">.</span><span style="color:#a6e22e">fromDataSet</span><span style="color:#f92672">(</span>vertices<span style="color:#f92672">,</span> edges<span style="color:#f92672">,</span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// run connected components on the subgraph for 10 iterations +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> components <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> g<span style="color:#f92672">.</span><span style="color:#a6e22e">run</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> ConnectedComponents<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;(</span><span 
style="color:#ae81ff">10</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// print the component id of the first 10 vertices +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>components<span style="color:#f92672">.</span><span style="color:#a6e22e">first</span><span style="color:#f92672">(</span><span style="color:#ae81ff">10</span><span style="color:#f92672">).</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The ldbc-flink-import tool is available on Github <a href="#references">[3]</a> and licensed under the GNU GPLv3. If you have any questions regarding the tool please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.</p> +<p>If you want to learn more about Apache Flink, a good starting point is the main documentation <a href="#references">[5]</a> and if you have any question feel free to ask the official mailing lists.<br> +There is also a nice set of videos <a href="#references">[6]</a> available from the latest Flink Forward conference.</p> +<h4 id="references">References</h4> +<p>[1] <a href="http://flink.apache.org/">http://flink.apache.org/</a></p> +<p>[2] <a href="https://github.com/dbs-leipzig/gradoop">https://github.com/dbs-leipzig/gradoop</a></p> +<p>[3] <a href="https://github.com/s1ck/ldbc-flink-import">https://github.com/s1ck/ldbc-flink-import</a></p> +<p>[4] <a href="https://gist.github.com/s1ck/b33e6a4874c15c35cd16">https://gist.github.com/s1ck/b33e6a4874c15c35cd16</a></p> +<p>[5] <a href="https://ci.apache.org/projects/flink/flink-docs-release-0.10/">https://ci.apache.org/projects/flink/flink-docs-release-0.10/</a></p> +<p>[6] <a 
href="https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA">https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA</a></p> + + + + + Seventh TUC Meeting + https://ldbcouncil.org/event/seventh-tuc-meeting/ + Mon, 09 Nov 2015 14:17:30 -0400 + + https://ldbcouncil.org/event/seventh-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.research.ibm.com/labs/watson">IBM&rsquo;s TJ Watson</a> facility on <strong>Monday and Tuesday November 9/10, 2015.</strong></p> +<p>This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify IBM security in advance, registration requests need to be in by Nov 1.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a><br> +- <a href="#date"><strong>Date</strong></a><br> +- <a href="#venue"><strong>Venue</strong></a><br> +- <a href="#maps-and-situation"><strong>Maps and situation</strong></a><br> +- <a href="#getting-there"><strong>Getting there</strong></a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>Monday, 9th of November 2015</strong></p> +<p>8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)</p> +<p>9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)</p> +<p>9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)</p> +<p>9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload</p> +<p>10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload</p> +<p>10:30-11:00 Coffee break</p> +<p>11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)</p> +<p>11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.</p> +<p>11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.</p> +<p>12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)</p> +<p>14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox</p> +<p>14:30 Peter Kogge (Notre Dame). 
BFS as in Graph500 on today&rsquo;s architectures</p> +<p>15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G</p> +<p>15:30-16:00 Coffee break</p> +<p>16:00 - 17:00 Technologies (chair Irini Fundulaki)</p> +<p>16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store</p> +<p>16:30 David Ediger (GeorgiaTech). STINGER</p> +<p>17:00 Gary King (Franz Inc.). AllegroGraph&rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties</p> +<p>17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics</p> +<p>18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase</p> +<p>19:00 Social dinner</p> +<p><strong>Tuesday 10th November 2015</strong></p> +<p>9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)</p> +<p>9:00 Philip Rathle (Neo). On openCypher</p> +<p>9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification</p> +<p>9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions</p> +<p>10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation</p> +<p>10:30 - 11:00 Coffee break</p> +<p>11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)</p> +<p>11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL</p> +<p>11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,</p> +<p>11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis</p> +<p>12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>9th and 10th November 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the IBM Thomas J Watson Research Center.<br> +The address is:</p> +<p><strong>IBM Thomas J Watson Research Center</strong><br> +<strong>1101 Kitchawan Rd,</strong><br> +<strong>Yorktown Heights, NY 10598, USA</strong></p> +<p>If you are using a <em>GPS system</em>, please enter <strong>&ldquo;200 Aqueduct Road, Ossining NY, 10562&rdquo;</strong> for accurate directions to the lab entrance. You may also want to check the routing online.</p> +<p>The meeting will take place in the <em>Auditorium</em> on November 9th, and in Meeting Room <em>20-043</em> on November 10th.</p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p>You are highly suggested to <strong>rent a car</strong> for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walkable distance to the IBM T.J. Watson Research Center. Feel free to find carpool with other attendees. You may find car rental and hotels through <a href="http://www.orbitz.com">www.orbitz.com</a>, or <a href="http://www.expedia.com">www.expedia.com</a> Feel free to email <a href="mailto:yxia@us.ibm.com">yxia@us.ibm.com</a> for any questions.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/seventh-tuc-meeting/attachments/6882333/15926330.png" alt=""></p> +<h6 id="getting-there"><strong>Getting there</strong></h6> +<p><strong>Upper and Eastern New England</strong></p> +<p>Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. 
IBM is on the left.</p> +<p><strong>New Haven and Connecticut Shores</strong></p> +<p>Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New Jersey</strong></p> +<p>Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>Upstate New York</strong></p> +<p>Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New York City (Manhattan)</strong></p> +<p>Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>John F. Kennedy International Airport</strong></p> +<p>North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>LaGuardia Airport</strong></p> +<p>East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. 
Continue with instructions from John F. Kennedy International Airport, above.</p> +<p><strong>Newark International Airport</strong></p> +<p>North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.</p> +<p><strong>Stewart International Airport</strong></p> +<p>Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.</p> +<p><strong>Westchester County Airport</strong></p> +<p>Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.</p> +<p><strong>Public Transportation</strong></p> +<p>Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.</p> + + + + + Elements of Instance Matching Benchmarks: a Short Overview + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + Tue, 16 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + <p>The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. 
In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using <strong>instance matching</strong> techniques and tools. Instance matching is also known as <strong>record linkage</strong> <a href="#references">[1]</a>, <strong>duplicate detection</strong> <a href="#references">[2]</a>, <strong>entity resolution</strong> <a href="#references">[3]</a> and <strong>object identification</strong> <a href="#references">[4]</a>.</p> +<p>For instance, a search in Geonames (<a href="http://www.geonames.org/">http://www.geonames.org/</a>) for &ldquo;Athens&rdquo; would return a resource (i.e., URI) accompanied with a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as for instance DBpedia (<a href="http://dbpedia.org/">http://dbpedia.org/</a>) or Open Government Datasets (<a href="http://data.gov.gr/">http://data.gov.gr/</a>). To obtain and exploit all necessary information about the city of Athens we need to establish that the retrieved resources refer to the same real world object.</p> +<p>Web resources are published by &ldquo;autonomous agents&rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data. 
Clearly, these reasons are not limited to problems that did arise in the era of Web Data, it is thus not surprising that instance matching systems have been around for several years <a href="#references">[2]</a><a href="#references">[5]</a>.</p> +<p>It is though essential at this point to develop, along with instance and entity matching systems, <em>instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs</em>. Hence, well defined, and good quality benchmarks are important for comparing the performance of the available or under development instance matching systems. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.</p> +<p>An instance matching benchmark for Linked Data consists of a <em>source</em> and <em>target dataset</em> implementing a set of <em>test-cases</em>, where each test case addresses a different kind of requirement regarding instance matching, a <em>ground truth</em> or <em>gold standard</em> and finally the <em>evaluation metrics</em> used to <em>assess the benchmark.</em></p> +<p>Datasets are the raw material of a benchmark. A benchmark comprises of a <em>source</em> and <em>target</em> dataset and the objective of an instance matching system is to discover the matches of the two. 
Datasets are characterized by (a) their <em>nature</em> (<em>real</em> or <em>synthetic</em>), (b) the <em>schemas/ontologies</em> they use, (c) their <em>domains</em>, (d) the <em>languages</em> they are written in, and (e) the <em>variations/heterogeneities</em> of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. <em>Synthetic datasets</em> are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner.</p> +<p>Datasets (and benchmarks) may contain different <em>kinds of variations</em> that correspond to <em>different test cases</em>. According to Ferrara et al. <a href="#references">[6]</a><a href="#references">[7]</a>, three kinds of variations exist for Linked Data, namely <em>data variations</em>, <em>structural variations</em> and <em>logical variations</em>. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.</p> +<p>The common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations. 
On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.</p> +<p>The <em>gold standard</em> is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.</p> +<p>Last, an instance matching benchmark uses <em>evaluation metrics</em> to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard <em>precision</em>, <em>recall</em> and <em>f-measure</em> metrics.</p> +<h4 id="references">References</h4> +<p>[1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. 
WWW 2006.</p> +<p>[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamaza, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).</p> +<p>[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.</p> +<p>[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.</p> +<p>[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.</p> +<p>[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3th ISWC workshop on ontology matching (OM 2008).</p> +<p>[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + Wed, 10 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + <p>In this post we will look at running the <a href="https://ldbcouncil.org/developer/snb">LDBC SNB</a> on <a href="https://virtuoso.openlinksw.com/">Virtuoso</a>.</p> +<p>First, let&rsquo;s recap what the benchmark is about:</p> +<ol> +<li> +<p>fairly frequent short updates, with no update contention worth mentioning</p> +</li> +<li> +<p>short random lookups</p> +</li> +<li> +<p>medium complex queries centered around a person&rsquo;s social environment</p> +</li> +</ol> +<p>The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.</p> +<p>The DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, <em>per se,</em> since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.</p> +<p>The workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.</p> +<p>The test system is the same as used in the <a href="http://www.openlinksw.com/weblog/oerling/?id=1739">TPC-H series</a>: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics branch</a> of <a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack, available from www.github.com</a>.</p> +<p>The dataset is the SNB 300G set, with:</p> +<table> +<thead> +<tr> +<th>1,136,127</th> +<th>persons</th> +</tr> +</thead> +<tbody> +<tr> +<td>125,249,604</td> +<td>knows edges</td> +</tr> +<tr> +<td>847,886,644</td> +<td>posts, including replies</td> +</tr> +<tr> +<td>1,145,893,841</td> +<td>tags of posts or replies</td> +</tr> +<tr> +<td>1,140,226,235</td> +<td>likes of posts or replies</td> +</tr> +</tbody> +</table> +<p>As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.</p> +<p>Below are the numerical quantities for a 400K operation run after 150K operations worth of warmup.</p> +<p><strong>Duration:</strong> 10:41.251<br> +<strong>Throughput:</strong> 623.71 (op/s)</p> +<p>The statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.</p> +<table> +<thead> +<tr> +<th>% of total</th> +<th>total_wait</th> +<th>name</th> +<th>count</th> +<th>mean</th> +<th>min</th> +<th>max</th> +</tr> +</thead> +<tbody> +<tr> +<td>20%</td> +<td>4,231,130</td> +<td>LdbcQuery5</td> +<td>656</td> +<td>6,449.89</td> +<td>245</td> +<td>10,311</td> +</tr> +<tr> +<td>11%</td> +<td>2,272,954</td> +<td>LdbcQuery8</td> +<td>18,354</td> +<td>123.84</td> +<td>14</td> +<td>2,240</td> +</tr> +<tr> +<td>10%</td> +<td>2,200,718</td> +<td>LdbcQuery3</td> +<td>388</td> +<td>5,671.95</td> +<td>468</td> +<td>17,368</td> +</tr> +<tr> +<td>7.3%</td> +<td>1,561,382</td> +<td>LdbcQuery14</td> +<td>1,124</td> +<td>1,389.13</td> +<td>4</td> +<td>5,724</td> +</tr> +<tr> +<td>6.7%</td> +<td>1,441,575</td> +<td>LdbcQuery12</td> +<td>1,252</td> +<td>1,151.42</td> +<td>15</td> +<td>3,273</td> +</tr> +<tr> +<td>6.5%</td> +<td>1,396,932</td> +<td>LdbcQuery10</td> +<td>1,252</td> +<td>1,115.76</td> +<td>13</td> +<td>4,743</td> +</tr> +<tr> +<td>5%</td> +<td>1,064,457</td> +<td>LdbcShortQuery3PersonFriends</td> +<td>46,285</td> +<td>22.9979</td> +<td>0</td> +<td>2,287</td> +</tr> +<tr> +<td>4.9%</td> +<td>1,047,536</td> +<td>LdbcShortQuery2PersonPosts</td> +<td>46,285</td> +<td>22.6323</td> +<td>0</td> +<td>2,156</td> +</tr> +<tr> +<td>4.1%</td> +<td>885,102</td> +<td>LdbcQuery6</td> +<td>1,721</td> +<td>514.295</td> +<td>8</td> +<td>5,227</td> +</tr> +<tr> +<td>3.3%</td> +<td>707,901</td> +<td>LdbcQuery1</td> +<td>2,117</td> +<td>334.389</td> +<td>28</td> +<td>3,467</td> +</tr> +<tr> +<td>2.4%</td> +<td>521,738</td> +<td>LdbcQuery4</td> +<td>1,530</td> +<td>341.005</td> +<td>49</td> +<td>2,774</td> +</tr> +<tr> +<td>2.1%</td> +<td>440,197</td> +<td>LdbcShortQuery4MessageContent</td> +<td>46,302</td> +<td>9.50708</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.9%</td> +<td>407,450</td> +<td>LdbcUpdate5AddForumMembership</td> +<td>14,338</td> +<td>28.4175</td> +<td>0</td> +<td>2,008</td> +</tr> +<tr> 
+<td>1.9%</td> +<td>405,243</td> +<td>LdbcShortQuery7MessageReplies</td> +<td>46,302</td> +<td>8.75217</td> +<td>0</td> +<td>2,112</td> +</tr> +<tr> +<td>1.9%</td> +<td>404,002</td> +<td>LdbcShortQuery6MessageForum</td> +<td>46,302</td> +<td>8.72537</td> +<td>0</td> +<td>1,968</td> +</tr> +<tr> +<td>1.8%</td> +<td>387,044</td> +<td>LdbcUpdate3AddCommentLike</td> +<td>12,659</td> +<td>30.5746</td> +<td>0</td> +<td>2,060</td> +</tr> +<tr> +<td>1.7%</td> +<td>361,290</td> +<td>LdbcShortQuery1PersonProfile</td> +<td>46,285</td> +<td>7.80577</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.6%</td> +<td>334,409</td> +<td>LdbcShortQuery5MessageCreator</td> +<td>46,302</td> +<td>7.22234</td> +<td>0</td> +<td>2,055</td> +</tr> +<tr> +<td>1%</td> +<td>220,740</td> +<td>LdbcQuery2</td> +<td>1,488</td> +<td>148.347</td> +<td>2</td> +<td>2,504</td> +</tr> +<tr> +<td>0.96%</td> +<td>205,910</td> +<td>LdbcQuery7</td> +<td>1,721</td> +<td>119.646</td> +<td>11</td> +<td>2,295</td> +</tr> +<tr> +<td>0.93%</td> +<td>198,971</td> +<td>LdbcUpdate2AddPostLike</td> +<td>5,974</td> +<td>33.3062</td> +<td>0</td> +<td>1,987</td> +</tr> +<tr> +<td>0.88%</td> +<td>189,871</td> +<td>LdbcQuery11</td> +<td>2,294</td> +<td>82.7685</td> +<td>4</td> +<td>2,219</td> +</tr> +<tr> +<td>0.85%</td> +<td>182,964</td> +<td>LdbcQuery13</td> +<td>2,898</td> +<td>63.1346</td> +<td>1</td> +<td>2,201</td> +</tr> +<tr> +<td>0.74%</td> +<td>158,188</td> +<td>LdbcQuery9</td> +<td>78</td> +<td>2,028.05</td> +<td>1,108</td> +<td>4,183</td> +</tr> +<tr> +<td>0.67%</td> +<td>143,457</td> +<td>LdbcUpdate7AddComment</td> +<td>3,986</td> +<td>35.9902</td> +<td>1</td> +<td>1,912</td> +</tr> +<tr> +<td>0.26%</td> +<td>54,947</td> +<td>LdbcUpdate8AddFriendship</td> +<td>571</td> +<td>96.2294</td> +<td>1</td> +<td>988</td> +</tr> +<tr> +<td>0.2%</td> +<td>43,451</td> +<td>LdbcUpdate6AddPost</td> +<td>1,386</td> +<td>31.3499</td> +<td>1</td> +<td>2,060</td> +</tr> +<tr> +<td>0.01%</td> +<td>1,848</td> 
+<td>LdbcUpdate4AddForum</td> +<td>103</td> +<td>17.9417</td> +<td>1</td> +<td>65</td> +</tr> +<tr> +<td>0.00%</td> +<td>44</td> +<td>LdbcUpdate1AddPerson</td> +<td>2</td> +<td>22</td> +<td>10</td> +<td>34</td> +</tr> +</tbody> +</table> +<p>At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.</p> +<p>The implementation is well optimized in general but still has maybe 30% room for improvement. We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.</p> +<p>The set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:</p> +<ul> +<li> +<p><em>Cardinality estimation under heavy data skew —</em> Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.</p> +</li> +<li> +<p><em>Covering indices —</em> Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. 
For example, an index on post by author can also contain the post&rsquo;s creation date.</p> +</li> +<li> +<p><em>Multi-hop graph traversal —</em> Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.</p> +</li> +<li> +<p><em>Top <em>K</em> —</em> Most queries returning posts order results by descending date. Once there are at least <em>k</em> results, anything older than the <em>k</em>th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top <em>k</em>.</p> +</li> +<li> +<p><em>Late projection —</em> Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.</p> +</li> +<li> +<p><em>Materialization —</em> Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.</p> +</li> +<li> +<p><em>Concurrency control —</em> Read-write contention is rare, as updates are randomly spread over the database. 
However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.</p> +</li> +</ul> +<p>In subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + SNB and Graphs Related Presentations at GRADES '15 + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + Fri, 29 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + <p>Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. 
GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.</p> +<p>Among the papers published in this edition we have &ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms&rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in <a href="https://github.com/ldbc">https://github.com/ldbc</a>) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have &ldquo;Microblogging Queries on Graph Databases: an Introspection&rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention &ldquo;Frappé: Querying the Linux Kernel Dependency Graph&rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.</p> +<p><a href="http://event.cwi.nl/grades2015/program.shtml">Check the complete agenda.</a></p> +<p>Meet you in Melbourne!</p> + + + + + SNB Interactive Part 2: Modeling Choices + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + Tue, 26 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + <p><a href="https://ldbcouncil.org/benchmarks/snb">​SNB Interactive</a> is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.</p> +<p>In the case of <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server">Virtuoso</a>, we have played with <a href="http://dbpedia.org/resource/SQL">SQL</a> and <a href="http://dbpedia.org/resource/SPARQL">SPARQL</a> implementations. For a fixed schema and well known workload, SQL will always win. 
The reason for this is that this allows to materialize multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.</p> +<h3 id="schema-design">Schema Design</h3> +<p>SNB has a regular schema described by a <a href="https://en.wikipedia.org/wiki/Unified_Modeling_Language">UML</a> diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.</p> +<p>The only table-level choice has to do with whether <code>posts</code> and <code>comments</code> are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. There is a single nullable foreign key from the reply to the post/comment being replied to.</p> +<p>The workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. 
There, having a composite key of <code>ps_creatorid</code>, <code>ps_creationdate</code>, <code>ps_postid</code> pays off since the top-k on <code>creationdate</code> can be pushed down into the index without needing a reference to the table.</p> +<p>The implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the <a href="http://dbpedia.org/resource/DEX_(Graph_database)">Sparksee</a> and <a href="http://dbpedia.org/resource/Neo4j">Neo4J</a> implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.</p> +<p>The benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 &ldquo;top of the wall&rdquo; to a single lookup. This does not however appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. <code>person1</code>, <code>person2</code> -&gt; <code>weight</code>. 
<code>Person1</code> is by convention the one with the smaller <code>p_personid</code>. Note that comparing id&rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI&rsquo;s with disastrous performance implications unless an implementation specific trick were used.</p> +<p>In the next installment we will analyze an actual run.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + Mon, 25 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + <p>LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.</p> +<p>On the industry track, LDBC will be presenting the <em>Social Network Benchmark Interactive Workload</em> by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).</p> +<p>You can read more about the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark here</a> and collaborate if you&rsquo;re interested!</p> +<p>The other presentation will be at the GRADES workshop within the SIGMOD program regarding <em>Graphalytics: A Big Data Benchmark for Graph-Processing platforms</em> by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.</p> +<p>Don&rsquo;t forget to check our presentations if you&rsquo;re attending the SIGMOD!</p> + + + + + SNB Interactive Part 1: What Is SNB Interactive Really About? + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + Thu, 14 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + <p>This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. 
This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.</p> +<p>With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.</p> +<p>The essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.</p> +<p>So far, we see that SNB confronts the implementor with choices in the following areas:</p> +<ul> +<li>Data model: Relational, RDF, property graph?</li> +<li>Physical model, e.g. row-wise vs. column-wise storage</li> +<li>Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures</li> +<li>Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers</li> +<li>Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc.</li> +<li>Parameters vs. literals: Sometimes different parameter values result in different optimal query plans</li> +<li>Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload</li> +<li>Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing.</li> +</ul> +<p>In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. 
Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.</p> +<ul> +<li>Data generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary, re-generating every time is not an option.</li> +<li>Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce?</li> +<li>Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a &lsquo;fast&rsquo; and &lsquo;slow&rsquo; case of a single query template. How long does one need to run to balance these fluctuations?</li> +<li>Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput.</li> +<li>Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse.</li> +<li>Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. 
The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable.</li> +<li>Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy?</li> +<li>Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters?</li> +</ul> +<p>The following posts will look at the above in light of actual experience.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + Why Do We Need an LDBC SNB-Specific Workload Driver? + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + Tue, 21 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + <p>In a previous <a href="https://ldbcouncil.org/tags/driver">3-part blog series</a> we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. 
What we didn&rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - &ldquo;graph&rdquo; is the word that best captures the novelty and difficulty of this work.</p> +<p><strong>Workload Execution - Traditional vs Graph</strong></p> +<p>Transactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.</p> +<p>To understand what is meant by &ldquo;traditional relational workloads&rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. Note, &ldquo;dependency&rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator&rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. 
In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order <em>for a given user</em>. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.</p> +<p>A significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each others walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other user&rsquo;s posts etc) initiated by a given user. Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. 
Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.</p> +<p>Such scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend&rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. <em>Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.</em></p> +<p><strong>Because it&rsquo;s a graph</strong></p> +<p>In short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.</p> + + + + + Event Driven Post Generation in Datagen + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + Fri, 10 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + <p>As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.</p> +<p>First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during friendship edges generation process. Then, for each person of the block, three types of forums are created:</p> +<ul> +<li> +<p>The wall of the person</p> +</li> +<li> +<p>The albums of the person</p> +</li> +<li> +<p>The groups where the person is a moderator</p> +</li> +</ul> +<p>We will put our attention to group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.</p> +<p>After assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible of, given a forum, generate a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.</p> +<p>Flashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to &ldquo;Enrique Iglesias&rdquo; tag, whose level is 11 and occurs on 29th of May of 2012 at 09:33:47.</p> +<p>Once the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:</p> +<ul> +<li> +<p>Determine the number of posts to generate</p> +</li> +<li> +<p>Select a random member of the group that will generate the post</p> +</li> +<li> +<p>Determine the event the post will be related to given the aforementioned cumulative distribution</p> +</li> +<li> +<p>Assign the date of the post based on the event date</p> +</li> +</ul> +<p>In order to assign the date to the post, based on the date of the event the post is assigned to, we follow the following probability density, which has been extracted from <a href="#references">[1]</a>. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.</p> +<p><img src="index.png" alt=""></p> +<p>Following the example of &ldquo;Enrique Iglesias&rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.</p> +<p><img src="index2.png" alt=""></p> +<p>In this blog entry we have seen how datagen creates event driven user activity. This allows us to reproduce the heterogenous post creation density found in a real social network, where post creation is driven by real world events.</p> +<h4 id="references">References</h4> +<p>[1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. 
KDD 2009: 497-506</p> + + + + + Sixth TUC Meeting + https://ldbcouncil.org/event/sixth-tuc-meeting/ + Thu, 19 Mar 2015 13:53:33 -0400 + + https://ldbcouncil.org/event/sixth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on <strong>Thursday and Friday March 19/20, 2015.</strong></p> +<p>The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the first benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.</li> +<li>Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<h3 id="agenda">Agenda</h3> +<p><strong>Thursday 19th March</strong></p> +<p>11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)</p> +<p>11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981131.pdf">slides</a></p> +<p>12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)</p> +<p>12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981137.pdf">slides</a></p> +<p>12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain</p> +<p>12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive</p> +<p>13:10 Claudio Martella (VUA): Giraph and Lighthouse</p> +<p>13:30 - 14:30 Lunch break</p> +<p>14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)</p> +<p>14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981132.pdf">slides</a></p> +<p>14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981133.pdf">slides</a></p> +<p>15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981139.pdf">slides</a></p> +<p>15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs</p> +<p>18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.</p> +<p>20:00 Social dinner at <a href="http://www.bastaix.com">Bastaix Restaurant</a>.</p> +<p><strong>Friday 20th March</strong></p> +<p>9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)</p> +<p>9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics</p> +<p>9:50 Alexandru Iosup (TU Delft). 
Graphalytics: A big data benchmark for graph-processing platforms</p> +<p>10:10 John Snelson (MarkLogic): Introduction to MarkLogic</p> +<p>10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload</p> +<p>10:50 Moritz Kaufmann. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/moritz-kaufmann-ldbc-snb-benchmark-auditing-6th-ldbc-tuc.pdf">The auditing experience</a></p> +<p>11:15 - 11:45 Coffee break</p> +<p>11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)</p> +<p>11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox</p> +<p>12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data</p> +<p>12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments</p> +<p>12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981138.pdf">slides</a></p> +<p>13:30 - 14:30 Lunch break</p> +<p>15:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>19th and 20th March 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held at &ldquo;Aula Master&rdquo; at A3 building located inside the &ldquo;Campus Nord UPC&rdquo; in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h5 id="maps-and-situation"><strong>Maps and situation</strong></h5> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<h5 id="finding-upc"><strong>Finding UPC</strong></h5> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<h5 id="finding-the-meeting-room"><strong>Finding the meeting room</strong></h5> +<h5 id="getting-there">Getting there</h5> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to<br> +the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<h5 id="the-locations-of-the-airport-and-the-city-centre"><strong>The locations of the airport and the city centre</strong></h5> + + + + + The LDBC Datagen Community Structure + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + Sun, 15 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + <p>This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.</p> +<p>When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, typically have highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.</p> +<p>The first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 <a href="#references">[1]</a>. Here we summarize the paper and its contributions and findings.</p> +<p>Existing synthetic graph generators such as Rmat <a href="#references">[1]</a> and Mag <a href="#references">[2]</a> are designed to produce graphs with long-tailed distributions and a large clustering coefficient, but completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR <a href="#references">[3]</a>, a graph generator that not only produces graphs with realistic high-level characteristics, but also enforces an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it outputs not only a graph but also the communities present in that graph; hence it can be used to test the quality of a community detection algorithm.</p> +<p>However, no one had studied whether the community structure produced by LFR was in fact realistic compared to real graphs. Even though the community structure in LFR exhibits interesting properties, such as the expected larger internal density than external, or a long-tailed distribution of community sizes, it lacks the noise and inhomogeneities present in a real graph. 
And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? Is it more or less realistic? The authors of <a href="#references">[1]</a> set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to <a href="#references">[4]</a>.</p> +<table> +<thead> +<tr> +<th></th> +<th><em>Nodes</em></th> +<th><em>Edges</em></th> +</tr> +</thead> +<tbody> +<tr> +<td><em>Amazon</em></td> +<td>334863</td> +<td>925872</td> +</tr> +<tr> +<td><em>Dblp</em></td> +<td>317080</td> +<td>1049866</td> +</tr> +<tr> +<td><em>Youtube</em></td> +<td>1134890</td> +<td>2987624</td> +</tr> +<tr> +<td><em>Livejournal</em></td> +<td>3997962</td> +<td>34681189</td> +</tr> +</tbody> +</table> +<p>The authors of <a href="#references">[1]</a> selected a set of statistical indicators to<br> +characterize the communities:</p> +<ul> +<li>The clustering coefficient</li> +<li>The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community.</li> +<li>The bridge ratio, which is the ratio of edges whose removal disconnects the community.</li> +<li>The diameter</li> +<li>The conductance</li> +<li>The size</li> +</ul> +<p>The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. 
We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.</p> +<ul> +<li>Many of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and Bridge ratio. This suggests that communities cannot be modeled using a single model.</li> +<li>84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact.</li> +<li>Ground truth communities are not very isolated; they have a lot of connections pointing outside of the community.</li> +<li>Most of the communities are small (10 or less nodes).</li> +<li>In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index.png" alt=""></td> +<td style="text-align:center"><img src="index2.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index3.png" alt=""></td> +<td style="text-align:center"><img src="index4.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index5.png" alt=""></td> +<td style="text-align:center"><img src="index6.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> 
+<p>The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index8.png" alt=""></td> +<td style="text-align:center"><img src="index9.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index10.png" alt=""></td> +<td style="text-align:center"><img src="index11.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index11.png" alt=""></td> +<td style="text-align:center"><img src="index12.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>The main conclusions that can be extracted from DATAGEN can be summarized as follows:</p> +<ul> +<li>DATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio.</li> +<li>The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the youtube and livejournal graphs.</li> +<li>Communities of DATAGEN graphs are not, as in real graphs, isolated, but in this case their level of isolation is significantly larger.</li> +<li>The diameter is small like in the real graphs.</li> +<li>It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and 
Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics.</li> +</ul> +<p>Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. From them, the authors extract the following conclusions:</p> +<ul> +<li>LFR graphs do not show the multimodal distribution observed in real graphs</li> +<li>Only the diameter shows a similar shape as in the ground truth.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index13.png" alt=""></td> +<td style="text-align:center"><img src="index14.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index15.png" alt=""></td> +<td style="text-align:center"><img src="index16.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index17.png" alt=""></td> +<td style="text-align:center"><img src="index18.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman&rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. 
On the other hand, LFR only succeeds significantly in the case of the Diameter.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index19.png" alt=""></td> +<td style="text-align:center"><img src="index20.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index21.png" alt=""></td> +<td style="text-align:center"><img src="index22.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index23.png" alt=""></td> +<td style="text-align:center"><img src="index24.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could potentially be exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!</p> +<h4 id="references">References</h4> +<p>[1] Arnau Prat-Pérez, <a href="http://dblp.uni-trier.de/pers/hd/d/Dom=iacute=nguez=Sal:David">David Domínguez-Sal</a>: How community-like is the structure of synthetically generated graphs? <a href="http://dblp.uni-trier.de/db/conf/sigmod/grades2014.html#PratD14">GRADES 2014</a></p> +<p>[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014</p> +<p>[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics</p> +<p>[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. 
Benchmark graphs for testing community detection algorithms. Physical Review E 2008.</p> + + + + + Industry Relevance of the Semantic Publishing Benchmark + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + Tue, 03 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + <h3 id="publishing-and-media-businesses-are-going-through-transformation">Publishing and media businesses are going through transformation</h3> +<p>I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendants were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. It was not long ago, in the year 2000, this stand was full. Back than the people in the Bay area were willing to pay for printed newspapers. But this is no longer true.</p> +<p>What’s driving this change in publishing and media?</p> +<ul> +<li> +<p>Widespread and instantaneous distribution of information over the Internet has turned news into somewhat of a &ldquo;commodity&rdquo; and few people are willing to pay for it</p> +</li> +<li> +<p>The wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;</p> +</li> +<li> +<p>Open access publishing has limited academic publishers to sell journals and books at prices that were considered fair ten years ago.</p> +</li> +</ul> +<p><em>Alongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.</em></p> +<h3 id="imagine-instant-news-in-context-imagine-personal-channels-imagine--triplestores">Imagine instant news in context, Imagine personal channels, Imagine &hellip; triplestores</h3> +<p>While plain 
news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. It is also well-known that producing such news in &ldquo;near real time&rdquo; is difficult and expensive using legacy processes and content management technology.</p> +<p>Another example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like <a href="http://new.dowjones.com/products/factiva/">Factiva</a>, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.</p> +<p>Finally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.</p> +<p>Many publishers have been pressed to advance their business. This, in turn, had led to quest to innovate. And semantic technology can help publishers in two fundamental ways:</p> +<ol> +<li>Generation of rich and &ldquo;meaningful&rdquo; (trying not to use &ldquo;semantic&rdquo; :-) metadata descriptions; 1. Dynamic retrieval of content, based on this rich metadata, enabling better delivery.</li> +</ol> +<p>In this post I write about &ldquo;semantic annotation&rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. 
The final part of the post is about triplestores – semantic graph database engines, used in DSP. To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.</p> +<h3 id="semantic-annotation-produces-rich-metadata-descriptions--the-fuel-for-semantic-publishing">Semantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing</h3> +<p>The most popular meaning of &ldquo;semantic annotation&rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.</p> +<p><img src="02_semantic_repository.png" alt=""></p> +<p>The concept of using <a href="http://infosys3.elfak.ni.ac.rs/nastava/attach/SemantickiWebKurs/sdarticle.pdf">text-mining for automatic semantic annotation</a> of text with respect to very large datasets, such as <a href="http://dbpedia.org/">DBPedia</a>, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether &ldquo;Paris&rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. 
Sometimes this is massively difficult – try to instruct a computer how to guess whether &ldquo;Hilton&rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or that I had the chance to meet Paris Hilton in person on the street in San Francisco.</p> +<p>Today there are plenty of tools (such as the <a href="https://www.ontotext.com/semantic-solutions/media-publishing/">Ontotext Media and Publishing</a> platform and <a href="https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki">DBPedia Spotlight</a>) and services (such as Thomson Reuters’ <a href="http://www.opencalais.com/">OpenCalais</a> and Ontotext’s <a href="http://s4.ontotext.com">S4</a>) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios where technology like this would revolutionize a business. This is the case with the Dynamic Semantic Publishing scenario described below.</p> +<h3 id="the-bbcs-dynamic-semantic-publishing-dsp">The BBC’s Dynamic Semantic Publishing (DSP)</h3> +<p>Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.</p> +<p>BBC Future Media &amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. 
At the end of the day DSP improves the user experience on BBC’s web site.</p> +<p><em>&ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories&rdquo;.</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">Jem Rayfield, Senior Technical Architect</a>, BBC News and Knowledge</p> +<p>The Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (&ldquo;edited by exception&rdquo;) and support semantic advertisement placement for audiences outside of the UK. The following quote explains the workflow when a new article gets into BBC’s content management system.</p> +<p><em>&ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#language">natural language and ontological determiner process</a> automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.</em></p> +<p><img src="03_bbc_sport.png" alt=""></p> +<p><em>Journalist-published metadata is captured and made persistent for querying using the resource description framework (<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#RDF"><em>RDF</em></a>) metadata representation and triple store technology. 
<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#BigOWLIM">A RDF triplestore</a> and <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#SPARQL">SPARQL</a> approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept &ldquo;Frank Lampard&rdquo;, then the framework infers and applies concepts such as &ldquo;England Squad&rdquo;, &ldquo;Group C&rdquo; and &ldquo;FIFA World Cup 2010&rdquo; &hellip;&rdquo;</em> &ndash; Jem Rayfield</p> +<p>One can consider each of the &ldquo;aggregation pages&rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.</p> +<p><em>&ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content</em></p> +<p><strong>…</strong><strong><em>we are not publishing pages, but publishing content</em></strong> <em>as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.</em></p> +<p><img src="04_content_tagging.png" alt=""></p> +<p><em>… The index pages are published automatically. 
This process is what assures us of the highest quality output, but still <strong>save large amounts of time</strong> in managing the site and <strong>makes it possible for us to efficiently run so many pages</strong> for the World Cup.&rdquo;</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/the_world_cup_and_a_call_to_ac.html">John O&rsquo;Donovan, Chief Technical Architect, BBC Future Media &amp; Technology</a></p> +<p>To get a real feeling about the load of the triplestore behind BBC&rsquo;s World Cup web site, here are some statistics:</p> +<ul> +<li> +<p>800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;</p> +</li> +<li> +<p>Average unique page requests/day: 2 million;</p> +</li> +<li> +<p>Average <strong>SPARQL queries/day: 1 million;</strong></p> +</li> +<li> +<p><strong>100s repository updates/inserts per minute</strong> with OWL 2 RL reasoning;</p> +</li> +<li> +<p>Multi data center that is fully resilient, clustered 6 node triplestore.</p> +</li> +</ul> +<h3 id="the-semantic-publishing-benchmark">The Semantic Publishing Benchmark</h3> +<p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).</p> +<p>SPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. This content is being enriched with metadata that describes it through links to reference knowledge:</p> +<ul> +<li> +<p><em>Reference knowledge:</em> taxonomies and databases that include relevant concepts, entities and factual information (e.g. 
sport statistics);</p> +</li> +<li> +<p><em>Metadata</em> for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.</p> +</li> +</ul> +<p>In this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:</p> +<ul> +<li> +<p><em>Aggregation queries</em> retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;</p> +</li> +<li> +<p><em>Updates</em>, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.</p> +</li> +</ul> +<p>SPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from <a href="http://www.geonames.org/">Geonames</a> for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.</p> +<p>A more technical introduction to SPB can be found in this <a href="https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark">post</a>. 
Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">post</a>. An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.</p> +<p>Despite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of &ldquo;fast flowing&rdquo; content need to be &ldquo;dispatched&rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:</p> +<ul> +<li> +<p>The Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;</p> +</li> +<li> +<p>Reasoning is needed to map content descriptions to queries in a flexible manner;</p> +</li> +<li> +<p>There are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.</p> +</li> +</ul> +<h3 id="spb-v20--steeper-for-the-engines-closer-to-the-publishers">SPB v.2.0 – steeper for the engines, closer to the publishers</h3> +<p>We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.</p> +<p>The major changes in SPB v.2.0 can be summarized as follows:</p> +<ul> +<li> +<p>Much bigger reference dataset: from 170 thousand to 22 million statements. 
Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;</p> +</li> +<li> +<p>Better interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;</p> +</li> +<li> +<p>Retrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.</p> +</li> +</ul> + + + + + OWL-Empowered SPARQL Query Optimization + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + Wed, 18 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + <p>The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. 
Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.</p> +<p>In this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompany Linked Data.</p> +<p>OWL adopts the Open World Assumption and hence OWL axioms are perceived primarily to infer new knowledge. Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.</p> +<p>This richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.</p> +<p>The aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. 
To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found <a href="LDBC_D4.4.2_final.pdf">here</a>.</p> +<h3 id="schema-based-optimization-techniques">Schema-Based Optimization Techniques</h3> +<p>Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.</p> +<p>A simple first case is the case of constraint violation. Consider the query below, which returns all instances of class <code>&lt;A&gt;</code> which are fillers of a specific property <code>&lt;P&gt;</code>. If the underlying schema contains the information that the range of <code>&lt;P&gt;</code> is class <code>&lt;B&gt;</code>, and that class <code>&lt;B&gt;</code> is disjoint from class <code>&lt;A&gt;</code>, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.</p> +<pre tabindex="0"><code>SELECT ?v +WHERE { ?v rdf : type &lt;A&gt; . + ?u &lt;P&gt; ?v . ?u &lt;P&gt; ?v1 . + ?u &lt;P1 &gt; ?v2 . ?u &lt;P2 &gt; ?v3 . + ?u &lt;P3 &gt; ?v4 . ?u &lt;P4 &gt; ?v5} +</code></pre><p>Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class <code>&lt;A1&gt;</code> can immediately be identified as not being in the answer set.</p> +<pre tabindex="0"><code>SELECT ?c +WHERE { ?x rdf: type ?c . ?x &lt;P&gt; ?y . 
+ FILTER NOT EXISTS \{ ?x rdf: type &lt;A1 &gt; }} +</code></pre><p>Another category of schema-empowered optimizations has to do with improved selectivity estimation. In this respect, knowledge about the cardinality (minimum cardinality, maximum cardinality, exact cardinality, functionality) of a property can be exploited to formulate better query plans, even if data statistics are incomplete, missing or erroneous.</p> +<p>Similarly, taking into account class hierarchies, or the definition of classes/properties via set theoretic constructs (union, intersection) at the schema level, can provide valuable information on the selectivity of certain triple patterns, thus facilitating the process of query optimization. Similar effects can be achieved using information about properties (functionality, transitivity, symmetry etc).</p> +<p>As an example of these patterns, consider the query below, where class <code>&lt;C&gt;</code> is defined as the intersection of classes <code>&lt;C1&gt;</code>,<code> &lt;C2&gt;</code>. Thus, the triple pattern <code>(?x rdf:type &lt;C&gt;)</code> is more selective than <code>(?y rdf:type &lt;C1&gt;)</code> and <code>(?z rdf:type &lt;C2&gt;)</code> and this should be immediately recognizable by the optimizer, without having to resort to cost estimations. This example shows also how unnecessary triple patterns can be pruned from a query to reduce the number of necessary joins. Figure 1 illustrates the query plan obtained when the OWL intersectionOf construct is used.</p> +<pre tabindex="0"><code>SELECT ?x +WHERE { ?x rdf: type &lt;C&gt; . ?x &lt;P1 &gt; ?y . + ?y rdf : type &lt;C1 &gt; . ?y &lt;P2 &gt; ?z . ?z rdf : type &lt;C2 &gt; } +</code></pre><p><img src="owl_constraints.png" alt="image"></p> +<p>Schema information can also be used by the query optimizer to rewrite SPARQL queries to equivalent ones that are found in a form for which already known optimization techniques are easily applicable. 
For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property <code>P4</code> is a symmetric property.</p> +<pre tabindex="0"><code>SELECT ?y ?y1 ?y2 ?y3 +WHERE { ?x &lt;P1 &gt; ?y . ?x &lt;P2 &gt; ?y1 . + ?x &lt;P3 &gt; ?y2 . ?y3 &lt;P4 &gt; ?x } +</code></pre><h3 id="conclusion">Conclusion</h3> +<p>In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:</p> +<ul> +<li>Cases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results.</li> +<li>Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing.</li> +<li>Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query.</li> +<li>Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques.</li> +</ul> +<p>This list is by no means complete, as further cases can be identified by optimizers. 
Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.</p> + + + + + Person Activity Subgraph Features in LDBC DATAGEN + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + Wed, 04 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + <p>When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other elements that are worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text, images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed of posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.</p> +<p>When looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries need to traverse parts of the person activity subgraph, and about 80% of all the data generated by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to make sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. 
In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.</p> +<h3 id="reaslistic-message-content">Realistic Message Content</h3> +<p>Messages&rsquo; content in DATAGEN is not random, but contains snippets of text extracted from Dbpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of comments (replies to posts), there is a probability of 0.66 to be very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is typical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying to posts change during the flow of the conversation, moving from post&rsquo;s tags to other related or randomly selected tags.</p> +<h3 id="non-uniform-activity-levels">Non uniform activity levels</h3> +<p>In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomenon by correlating the activity level with the number of friends the person has. That is, the larger the number of friends a person has, the larger the number of posts it creates, and also, the larger the number of groups it belongs to.</p> +<h3 id="time-correlated-post-and-comment-generation">Time correlated post and comment generation</h3> +<p>In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. 
For this reason, we observe spikes of activity around these events, where the amount of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also, not all the events are equally relevant, so some spikes are larger than others. The shape of the activity is modeled following the model described in <a href="#references">[1]</a>. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.</p> +<p><img src="1.png" alt="image"></p> +<p>As we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing activity volume is due to the fact that more people are added to the social network as time advances.</p> +<p>In this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.</p> +<h4 id="references">References</h4> +<p>[1] Leskovec, J., Backstrom, L., &amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In <em>Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 497-506). 
ACM.</p> + + + + + SNB Driver - Part 2: Tracking Dependencies Between Queries + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + Fri, 23 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + <p>The <a href="https://ldbcouncil.org/post/snb-driver-part-1">SNB Driver part 1</a> post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we&rsquo;ll drill down deeper into the details of what it means to execute &ldquo;dependent queries&rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.</p> +<h3 id="definitions">Definitions</h3> +<ul> +<li> +<p><em>Simulation Time (ST)</em>: notion of time created by data generator. All time stamps in the generated data set are in simulation time</p> +</li> +<li> +<p><em>Real Time (RT)</em>: wall clock time</p> +</li> +<li> +<p><em>Time Compression Ratio</em>: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark</p> +</li> +<li> +<p><em>Operation</em>: read and/or write</p> +</li> +<li> +<p><em>Dependencies</em>: operations in this set introduce dependencies in the workload. 
That is, for every operation in this set there exists at least one other operation (in Dependents) that can not be executed until this operation has been processed</p> +</li> +<li> +<p><em>Dependents</em>: operations in this set are dependent on at least one other operation (in Dependencies) in the workload</p> +</li> +<li> +<p><em>Due Time (DueT)</em>: point in simulation time at which the execution of an operation should be initiated.</p> +</li> +<li> +<p><em>Dependent Time (DepT)</em>: in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.</p> +</li> +<li> +<p><em>Safe Time (SafeT)</em>: time duration.</p> +<ul> +<li> +<p>when two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them</p> +</li> +<li> +<p>SafeT is the minimum duration between the Dependent Time and Due Time of any operation in Dependents</p> +</li> +</ul> +</li> +<li> +<p><em>Operation Stream</em>: sequence of operations ordered by Due Time (dependent operations must be separated by at least SafeT)</p> +</li> +<li> +<p><em>Initiated Operations</em>: operations that have started executing but not yet finished</p> +</li> +<li> +<p><em>Local Completion Time (per driver)</em>: point in simulation time behind which there are no uncompleted operations. Local Completion Time = min(min(Initiated Operations), max(Completed Operations))</p> +</li> +<li> +<p><em>Global Completion Time (GCT)</em>: minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation that operation is safe to execute, i.e., the operations it depends on have all completed executing. 
Global Completion Time = min(Local Completion Time)​</p> +</li> +<li> +<p><em>Execution Window (Window)</em>: a timespan within which all operations can be safely executed</p> +<ul> +<li> +<p>All operations satisfying window.startTime &lt;= operation.DueT &lt; window.endTime may be executed</p> +</li> +<li> +<p>Within a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window</p> +</li> +<li> +<p>To ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 &lt; window.duration &lt;= SafeT</p> +</li> +<li> +<p>Window duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable</p> +</li> +<li> +<p>Before any operations within a window can start executing it is required that: GCT &gt;= window.startTime - (SafeT - window.duration)</p> +</li> +<li> +<p>All operations within a window must initiate and complete between window start and end times: window.startTime &lt;= operation.initiate &lt; window.endTime and window.startTime &lt;= operation.complete &lt; window.endTime</p> +</li> +</ul> +</li> +<li> +<p><em>Dependency Mode</em>: defines dependencies, constraints on operation execution order</p> +</li> +<li> +<p><em>Execution Mode</em>: defines how the runtime should execute operations of a given type</p> +</li> +</ul> +<h3 id="tracking-dependencies">Tracking Dependencies</h3> +<p>Now, the fun part, making sure dependent operations are executed in the correct order.</p> +<p>Consider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. 
That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).</p> +<p>Logically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:</p> +<ul> +<li>the set of operations that have started executing but not yet finished.</li> +</ul> +<p>Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:</p> +<ul> +<li>the set of operations that have started and finished executing.</li> +</ul> +<p>Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):</p> +<ul> +<li>the point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time.</li> +</ul> +<p>LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:</p> +<ul> +<li> +<p><em>Due Time</em>: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled</p> +</li> +<li> +<p><em>GCT</em>: every operation (from Dependencies) with a Due Time before this point in time has completed execution</p> +</li> +</ul> +<p>However, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. 
What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:</p> +<ul> +<li>in addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute.</li> +</ul> +<p>Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.</p> +<h3 id="scalable-execution-in-the-presence-of-dependencies">Scalable execution in the Presence of Dependencies</h3> +<p>The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:</p> +<p>a) make the generated load less &lsquo;bursty&rsquo;</p> +<p>b) allow the driver to &lsquo;scale&rsquo;, so when the driver is given more resources (CPUs, servers, etc.) 
it is able to generate more load.</p> +<p>In the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. Logically, all operations within a Window are executed at the same time, some time within the Window. No guarantee is made regarding exactly when, or in what order, an operation will execute within its Window.</p> +<p>The reasons this approach is correct are as follows:</p> +<ul> +<li> +<p>Operations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked</p> +</li> +<li> +<p>The minimum duration between the Dependent Time and Due Time of any operation in Dependents is known (can be calculated by scanning through workload once); this duration is referred to as Safe Time (SafeT)</p> +</li> +<li> +<p>A window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until</p> +<p>GCT &gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT</p> +</li> +</ul> +<p>The advantages of such an execution mode are as follows:</p> +<ul> +<li> +<p>As no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window</p> +</li> +<li> +<p>Then, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. 
There is no need or benefit to communicating GCT protocol message more frequently than approximately Window.duration, the side effect of which is reduced network traffic</p> +</li> +<li> +<p>Further, by making no guarantees regarding the order of execution the driver is free to reschedule operations (within Window bounds). The advantage being that operations can be rearranged in such a way as to reduce unwanted bursts of load during execution, which could otherwise occur while synchronizing GCT during demanding workloads. For example, a uniform scheduler may modify operation Due Times to be uniformly distributed across the Window timespan, to &lsquo;smoothen&rsquo; the load within a Window.</p> +</li> +</ul> +<p>As with any system, there are trade-offs to this design, particularly regarding Window.duration. The main trade-off is that between &lsquo;workload resolution&rsquo; and scalability. Increasing Window.duration reduces synchronization but also reduces the resolution at which the workload definition is followed. That is, the generated workload becomes less like the workload definition. However, as this is both bounded and configurable, it is not a major concern. This issue is illustrated in Figure 1, where the same stream of events is split into two different workloads based on different size of the Window. The workload with Window size 5 (on the right) has better resolution, especially for the &lsquo;bursty&rsquo; part of the event stream.</p> +<p><img src="window-scheduling.png" alt="image"><br> +Figure 1. Window scheduling</p> +<p>This design also trades a small amount of repeatability for scalability: as there are no timing or ordering guarantees within a window, two executions of the same window are not guaranteed to be equivalent - &lsquo;what happens in the window stays in the window&rsquo;. Despite sacrificing this repeatability, the results of operations do not change. 
No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.</p> + + + + + SNB Driver - Part 3: Workload Execution Putting It All Together + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + Tue, 20 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + <p>Up until now we have introduced the <a href="https://ldbcouncil.org/post/snb-driver-part-1">challenges faced when executing the LDBC SNB benchmark</a>, as well as explained <a href="https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries">how some of these are overcome</a>. With the foundations laid, we can now explain precisely how operations are executed.</p> +<p>Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.</p> +<h3 id="dependency-modes">Dependency Modes</h3> +<p>While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.</p> +<p>Another way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. 
The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. During workload execution, operations of each type are treated as follows:</p> +<p><strong>• None</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>– Prior Execution: do nothing – After Execution: do nothing</p> +<p><strong>• Read Only</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: wait for GCT &gt;= operation.DepTime – After Execution: do nothing</p> +<p><strong>• Write Only</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<p><strong>• Read Write</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced 
sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations, wait for GCT &gt;= operation.DepT</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<h3 id="execution-modes">Execution Modes</h3> +<p>Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more streams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.</p> +<p><strong>• Asynchronous</strong>: operations are executed individually, when their Due Time arrives.</p> +<p>Motivation: This is the default execution mode, it executes operations as true to the workload definition as possible.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: unbounded</p> +<p>– Max Execution Time: unbounded</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay</p> +<p><strong>• Synchronous</strong>: operations are executed individually, sequentially, in blocking manner.</p> +<p>Motivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. 
However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency. Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.</p> +<p>The alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. 
Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler</p> +<p>– Execute When time &gt;= operation.DueT and previousOperation.completed == true (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: 1</p> +<p>– Max Execution Time: nextOperation.DueT - operation.DueT</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay. E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay</p> +<p><strong>• Partially Synchronous</strong> (Windowed Execution, described in Section 3.4 in more detail), groups of operations from the same time window are executed together</p> +<p>– Re-scheduling Before Execution: Yes, as long as the following still holds:</p> +<p>window.startTime &lt;= operation.DueT &lt; window.startTime + window.duration</p> +<p>Operations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due Times, and therefore ordering, may be modified</p> +<p>– Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: number of operations within window</p> +<p>– Max Execution Time: (window.startTime + window.duration) - operation.DueT</p> +<p>– Failure: operation execution starts later than: window.startTime + window.duration; or operation execution does not finish by: window.startTime + window.duration</p> +<h3 id="tying-it-back-to-ldbc-snb">Tying it back to LDBC SNB</h3> +<p>The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stems from dependencies introduced by the graph structure. 
In other words, workload partitioning becomes as hard as graph partitioning.</p> +<p>The LDBC SNB data can in fact be seen as a union of two parts:</p> +<ol> +<li> +<p>Core Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.</p> +</li> +<li> +<p>User Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the &ldquo;core&rdquo; part are satisfied (i.e., users don&rsquo;t post things before the profiles are created, etc.).</p> +</li> +</ol> +<p>In order to avoid friendship graph partitioning, the driver introduces the concept SafeT, the minimal simulation time that should pass between two dependent events.</p> +<p>This property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety. Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.</p> +<p>On the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there is typically a lot of forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. 
we efficiently create the so-called flash-mob effects in the posting/replying workload).</p> + + + + + Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + Tue, 13 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + <p>Until now we have discussed several aspects of the <a href="https://ldbcouncil.org/benchmarks/spb">Semantic Publishing Benchmark (SPB)</a> such as the <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">difference in performance between virtual and real servers configuration</a>, how to choose an <a href="https://ldbcouncil.org/post/making-semantic-publishing-execution-rules">appropriate query mix</a> for a benchmark run and our experience with using SPB in the development process of GraphDB for <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">finding performance issues</a>.</p> +<p>In this post we provide a step-by-step guide on how to run SPB using the <a href="http://rdf4j.org/">Sesame</a> RDF data store on a fresh install of <a href="http://releases.ubuntu.com/14.04.1/">Ubuntu Server 14.04.1</a>. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.</p> +<h3 id="prerequisites">Prerequisites</h3> +<p>We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:</p> +<ul> +<li>Git</li> +<li>Apache Ant 1.8 or higher</li> +<li>OpenJDK 6 or Oracle JDK 6 or higher</li> +<li>Apache Tomcat 7 or higher</li> +</ul> +<p>If you already have these components installed on your machine you can directly proceed to the next section: <em>Installing Sesame</em></p> +<p>Following are sample commands which can be used to install the required software components:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo apt-get install git +</span></span><span style="display:flex;"><span>sudo apt-get install ant +</span></span><span style="display:flex;"><span>sudo apt-get install default-jdk +</span></span><span style="display:flex;"><span>sudo apt-get install tomcat7 +</span></span></code></pre></div><p>Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.</p> +<p>After a successful installation of Apache Tomcat you should be able to get the default splash page <em>“It works”</em> when you open your web browser and enter the following address: http://&lt;your_ip_address&gt;:8080</p> +<h3 id="installing-sesame">Installing Sesame</h3> +<p>We will use current Sesame version 2.7.14. 
You can download it <a href="http://sourceforge.net/projects/sesame/files/Sesame%202/">here</a> or run following command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>wget <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download&#34;</span> <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> -O openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>Then extract the Sesame tarball:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>tar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>To deploy sesame you have to copy the two war files that are in <em>openrdf-sesame-2.7.14/war</em> to <em>/var/lib/tomcat7/webapps</em></p> +<p>From <em>openrdf-sesame-2.7.14/war</em> you can do it with command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>cp openrdf-*.war &lt;tomcat_install&gt;/webapps +</span></span></code></pre></div><p>Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.</p> +<p>By default the configuration directory is: <em>/usr/share/tomcat7/.aduna</em></p> +<p>Create the directory:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" 
data-lang="bash"><span style="display:flex;"><span>sudo mkdir /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Then change the ownership:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chown tomcat7 /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>And finally you should give the necessary permissions:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chmod o+rwx /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Now when you go to: http://&lt;your_ip_address&gt;:8080/openrdf-workbench/repositories</p> +<p>You should get a screen like this:</p> +<p><img src="01-Sesame-repo-list.png" alt="image"></p> +<h3 id="setup-spb">Setup SPB</h3> +<p>You can download the SPB code and find brief documentation on GitHub:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm">https://github.com/ldbc/ldbc_spb_bm</a></p> +<p>A detailed documentation is located here:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf">https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf</a></p> +<p>SPB offers many configuration options which control various features of the benchmark e.g.:</p> +<ul> +<li>query mixes</li> +<li>dataset size</li> +<li>loading datasets</li> +<li>number of agents</li> +<li>validating results</li> +<li>test conformance to OWL2-RL ruleset</li> +<li>update rate of agents</li> +</ul> +<p>Here we demonstrate how to generate a dataset and execute a simple test<br> +run with it.</p> +<p>First download the SPB source code from the repository:</p> +<div class="highlight"><pre tabindex="0" 
style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>git clone https://github.com/ldbc/ldbc_spb_bm.git +</span></span></code></pre></div><p>Then in the ldbc_spb_bm directory build the project:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>ant build-basic-querymix +</span></span></code></pre></div><p>If you simply execute the command:</p> +<pre tabindex="0"><code>ant +</code></pre><p>you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this step-by-step guide, configuration shown above is sufficient.</p> +<p>Depending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat&rsquo;s startup files e.g. in <em>catalina.sh</em>:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>export JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;-d64 -Xmx4G&#34;</span> +</span></span></code></pre></div><p>To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:</p> +<p><img src="02-Sesame-create-repo.png" alt="image"></p> +<p>Then we need to point the benchmark test driver to the SPARQL endpoint of that repository. 
This is done in <em>ldbc_spb_bm/dist/test.properties</em> file.</p> +<p>The default value of <em>datasetSize</em> in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.</p> +<p>You need to change</p> +<pre tabindex="0"><code>datasetSize=1000000 +</code></pre><p>Also the URLs of the SPARQL endpoint for the repository</p> +<pre tabindex="0"><code>endpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 +endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements +</code></pre><p>First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.</p> +<p>These are the settings to do that, following parameters will &lsquo;instruct&rsquo; the SPB test driver to perform all the actions described above:</p> +<pre tabindex="0"><code>#Benchmark Operational Phases +loadOntologies=true +loadReferenceDatasets=true +generateCreativeWorks=true +loadCreativeWorks=true +generateQuerySubstitutionParameters=true +validateQueryResults=false +warmUp=false +runBenchmark=false +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>To run the benchmark execute the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>java -jar semantic_publishing_benchmark-basic-standard.jar +</span></span><span style="display:flex;"><span>test.properties +</span></span></code></pre></div><p>When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.</p> +<p>Next we will measure the performance of Sesame Data Store by changing some configuration properties:</p> +<pre tabindex="0"><code>#Benchmark 
Configuration Parameters +warmupPeriodSeconds=60 +benchmarkRunPeriodSeconds=300 +... +#Benchmark Operational Phases +loadOntologies=false +loadReferenceDatasets=false +generateCreativeWorks=false +loadCreativeWorks=false +generateQuerySubstitutionParameters=false +validateQueryResults=false +warmUp=true +runBenchmark=true +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>After the benchmark test run has finished, result files are saved in the folder: <em>dist/logs</em></p> +<p>There you will find three types of results: the result summary of the benchmark run (<em>semantic_publishing_benchmark_results.log</em>), brief results and detailed results.</p> +<p>In <em>semantic_publishing_benchmark_results.log</em> you will find the results reported per second. They should be similar to the listing below:</p> +<p>Benchmark Results for the 300-th second</p> +<pre tabindex="0"><code>Seconds : 300 (completed query mixes : 0) + Editorial: + 2 agents + + 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) + 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) + 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) + + 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) + 0.0300 average operations per second + + Aggregation: + 8 agents + + 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) + 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) + 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) + 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) + 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) + 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) + 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) + 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) + 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) +</code></pre><p>This step-by-step 
guide gave an introduction on how to setup and run the SPB on a Sesame Data Store. Further details can be found in the reference documentation listed above.</p> +<p>If you have any troubles running the benchmark, don&rsquo;t hesitate to comment or use our social media channels.</p> +<p>In a future post we will go through some of the parameters of SPB and check their performance implications.</p> + + + + + Semantic Publishing Instance Matching Benchmark + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + Tue, 30 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + <p>The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.</p> +<p>The SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:</p> +<ul> +<li>value-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity</li> +<li>structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation)</li> +</ul> +<p>The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:</p> +<ul> +<li>instance (in)equality (owl:sameAs, owl:differentFrom)</li> +<li>class and property equivalence (owl:equivalentClass, owl:equivalentProperty)</li> +<li>class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties)</li> +<li>class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf)</li> +<li>property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty)</li> +<li>complex class definitions (owl:unionOf, owl:intersectionOf)</li> +</ul> +<p>SPIMBench uses and extends the ontologies of LDBC&rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB&rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.</p> +<p>Value and structure-based test cases are implemented using the SWING framework <a href="#references">[1]</a> on data and object type properties respectively. 
These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.</p> +<p>SPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.</p> +<p>SPIMBench can be downloaded from <a href="https://github.com/jsaveta/SPIMBench">our repository</a> and a more thorough description thereof can be found on <a href="http://www.ics.forth.gr/isl/spimbench/">http://www.ics.forth.gr/isl/spimbench/</a>.</p> +<h4 id="references">References</h4> +<p>[1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + Further Developments in SNB BI Workload + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + Thu, 18 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + <p>We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and I are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.</p> +<p>As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.</p> +<p>There are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.</p> +<p>Let’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.</p> +<p>One of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequence of events and the quantity and quality of interactions between players lead to intractably long queries which are hard to understand and debug. 
Therefore, views and intermediate materializations become increasingly necessary.</p> +<p>Another feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.</p> +<p>Since a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.</p> +<p>An orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.</p> +<p>So, let’s see how some of these aspects could be captured in the SNB context.</p> +<p>Geography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.</p> +<p>The communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or fewer participants. Membership in a high traffic forum with few participants would indicate a strong connection. 
Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -&gt; connection strength. In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.</p> +<p>The behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster at a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.</p> +<p>In John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes nearly always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”</p> +<p>Analytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.</p> +<p>From a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. 
Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.</p> +<p>This leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.</p> +<p>There is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.</p> + + + + + Sizing AWS Instances for the Semantic Publishing Benchmark + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + Wed, 17 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + <p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">BBC Dynamic Semantic Publishing</a> scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). 
As we <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">wrote earlier</a>, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.</p> +<p>Lately we tested different Amazon Web Services (<a href="https://aws.amazon.com/">AWS</a>) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that <a href="https://www.ontotext.com/products/ontotext-graphdb/">GraphDB</a> experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.</p> +<h3 id="the-experiment">The Experiment</h3> +<p>For our tests we use:</p> +<ul> +<li>GraphDB Standard v6.1</li> +<li>LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: +<ul> +<li>8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also</li> +<li>50M dataset (SF1)</li> +<li>40 minutes of benchmark run time (60 seconds of warm up)</li> +</ul> +</li> +<li>5 different Amazon EC2 instances and one local server</li> +</ul> +<p>Each test run is cold, i.e. 
data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.</p> +<p>We use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.</p> +<p>We also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.</p> +<h3 id="the-results">The Results</h3> +<p>For the tests we measured:</p> +<ul> +<li><em>queries/s</em> for the read threads, where queries include SELECT and CONSTRUCT</li> +<li><em>updates/s</em> for the write threads, where an update operation is INSERT or DELETE</li> +<li><em>queries/$</em> and <em>updates/$</em> – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput</li> +<li><em>update/vCPU</em> – modification operations per vCPU per second</li> +</ul> +<p>Results (Table 1.) 
provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is c3.2xlarge machine.</p> +<p>The improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold where the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.</p> +<p>Table 1. 
SPB Measurement Results on AWS and Local Servers</p> +<table> +<thead> +<tr> +<th>Server Type</th> +<th>vCPUs</th> +<th>R/W Agents</th> +<th>RAM (GB)</th> +<th>&ldquo;Storage (GB, SSD)&rdquo;</th> +<th>Price USD/h</th> +<th>Queries/ sec.</th> +<th>Updates/ sec.</th> +<th>Queries/ USD</th> +<th>Updates/ USD</th> +<th>Updates/ vCPU</th> +</tr> +</thead> +<tbody> +<tr> +<td>m3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>15</td> +<td>2x 40</td> +<td>0.28</td> +<td>8.39</td> +<td>8.23</td> +<td>107 882</td> +<td>105 873</td> +<td>2.06</td> +</tr> +<tr> +<td>m3.2xlarge</td> +<td>8</td> +<td>8/2</td> +<td>30</td> +<td>2x 80</td> +<td>0.56</td> +<td>15.44</td> +<td>15.67</td> +<td>99 282</td> +<td>100 752</td> +<td>1.96</td> +</tr> +<tr> +<td>c3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>7.5</td> +<td>2x 40</td> +<td>0.21</td> +<td>7.17</td> +<td>6.78</td> +<td>122 890</td> +<td>116 292</td> +<td>1.7</td> +</tr> +<tr> +<td><strong>c3.2xlarge</strong></td> +<td><strong>8</strong></td> +<td><strong>8/2</strong></td> +<td><strong>15</strong></td> +<td><strong>2x 80</strong></td> +<td><strong>0.42</strong></td> +<td><strong>16.46</strong></td> +<td><strong>14.56</strong></td> +<td><strong>141 107</strong></td> +<td><strong>124 839</strong></td> +<td><strong>1.82</strong></td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>8/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>23.23</strong></td> +<td><strong>21.17</strong></td> +<td><strong>99 578</strong></td> +<td><strong>90 736</strong></td> +<td><strong>1.32</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>8/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>22.89</td> +<td>20.39</td> +<td>98 100</td> +<td>87 386</td> +<td>1.27</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/2</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.6</td> +<td>19.11</td> +<td>114 
000</td> +<td>81 900</td> +<td>1.19</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.19</td> +<td>19.18</td> +<td>112 243</td> +<td>82 200</td> +<td>1.2</td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>14/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>30.84</strong></td> +<td><strong>16.88</strong></td> +<td><strong>132 171</strong></td> +<td><strong>72 343</strong></td> +<td><strong>1.06</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>14/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>29.67</td> +<td>17.8</td> +<td>127 157</td> +<td>76 286</td> +<td>1.11</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.11</td> +<td>32.04</td> +<td>156 712</td> +<td>135 302</td> +<td>1</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.31</td> +<td>32.07</td> +<td>157 557</td> +<td>135 429</td> +<td>1</td> +</tr> +<tr> +<td><strong>Local</strong></td> +<td><strong>32</strong></td> +<td><strong>10/2</strong></td> +<td><strong>256</strong></td> +<td><strong>8x 256</strong></td> +<td><strong>0.85</strong></td> +<td><strong>40</strong></td> +<td><strong>31.01</strong></td> +<td><strong>168 916</strong></td> +<td><strong>130 952</strong></td> +<td><strong>0.97</strong></td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.39</td> +<td>26.42</td> +<td>153 672</td> +<td>111 569</td> +<td>0.83</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.22</td> +<td>26.39</td> +<td>152 954</td> +<td>111 443</td> +<td>0.82</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>20/2</td> +<td>256</td> +<td>8x 256</td> 
+<td>0.85</td> +<td>34.59</td> +<td>23.86</td> +<td>146 070</td> +<td>100 759</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<h3 id="the-optimal-number-of-test-agents">The Optimal Number of Test Agents</h3> +<p>Experimenting with different numbers of aggregation (read) and editorial (write) agents at c3.4xlarge and the local server, we made some interesting observations:</p> +<ul> +<li>There is almost no benefit in using more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads;</li> +<li>Using more read agents can have a negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the latter case GraphDB handles a slightly higher number of queries (31 vs. 23) we see a drop in the update rates (from 21 to 17);</li> +<li>Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations;</li> +<li>For machines with more than 16 cores, a configuration like 10/2 or 14/2 would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server;</li> +<li>Launching more than 14 read agents does not help even on the local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization.</li> +<li>There is some overhead when handling a bigger number of agents, as the results for the local server tests with 14/3 and 20/2 show the worst results for both queries and updates.</li> +</ul> +<h3 id="efficiency-and-cost">Efficiency and Cost</h3> +<p>AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.</p> +<p>Cloud infrastructure providers like Amazon allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.</p> +<p>$1 spent on c3.2xlarge ($0.42/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!</p> +<p>The full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for a machine like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine has 256GB of RAM, which is overkill for the Semantic Publishing Benchmark run at 50M scale. Under all these assumptions we see that using a local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference in price per query in this case is low indicates that using AWS services comes at very low extra cost.</p> +<p>To put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of the cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:</p> +<ul> +<li><strong>100 queries/sec.</strong> handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events.</li> +<li><strong>10 updates/sec.</strong> - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in isolation. 
There are relatively few content management applications that need more than 36 000 updates per hour.</li> +<li><strong>$81/day</strong> is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times.</li> +</ul> + + + + + DATAGEN: a Realistic Social Network Data Generator + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + Sat, 06 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/getting-started-with-snb">Getting started with snb</a>, <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">DATAGEN: data generation for the Social Network Benchmark</a>), Arnau Prat discussed the main features and characteristics of DATAGEN: <em>realism</em>, <em>scalability</em>, <em>determinism</em>, <em>usability</em>. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the <a href="https://github.com/ldbc/ldbc_snb_datagen">instructions for generating a SNB dataset</a> and <a href="https://github.com/ldbc/ldbc_snb_implementations/tree/master/interactive/virtuoso">for loading the dataset into Virtuoso</a>. 
In the following sections, we analyze several aspects of the generated dataset.</p> +<h3 id="a-realistic-social-graph">A Realistic social graph</h3> +<p>One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their <em><knows></em> relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (See the note about <a href="https://www.facebook.com/notes/facebook-data-team/anatomy-of-facebook/10150388519243859">Facebook Anatomy</a>). This is not by chance, as DATAGEN has been designed to deliberately reproduce the Facebook&rsquo;s graph distribution.</p> +<p><img src="Cumulative-distribution.png" alt="image"> <br> +Figure 1: Cumulative distribution #friends per user</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-r" data-lang="r"><span style="display:flex;"><span><span style="color:#75715e">#R script for generating the social degree distribution </span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">#Input files: person_knows_person_*.csv</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(data.table) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(igraph) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(plotrix) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">require</span>(bit64) +</span></span><span style="display:flex;"><span>dflist <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">lapply</span>(<span 
style="color:#a6e22e">commandArgs</span>(trailingOnly <span style="color:#f92672">=</span> <span style="color:#66d9ef">TRUE</span>), fread, sep<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;|&#34;</span>, +</span></span><span style="display:flex;"><span> header<span style="color:#f92672">=</span>T, select<span style="color:#f92672">=</span><span style="color:#ae81ff">1</span><span style="color:#f92672">:</span><span style="color:#ae81ff">2</span>, colClasses<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;integer64&#34;</span>) +</span></span><span style="display:flex;"><span> df <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">rbindlist</span>(dflist) <span style="color:#a6e22e">setNames</span>(df, <span style="color:#a6e22e">c</span>(<span style="color:#e6db74">&#34;P1&#34;</span>, <span style="color:#e6db74">&#34;P2&#34;</span>)) +</span></span><span style="display:flex;"><span>d2 <span style="color:#f92672">&lt;-</span> df[,<span style="color:#a6e22e">length</span>(P2),by<span style="color:#f92672">=</span>P1] +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">pdf</span>(<span style="color:#e6db74">&#34;socialdegreedist.pdf&#34;</span>) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">plot</span>(<span style="color:#a6e22e">ecdf</span>(d2<span style="color:#f92672">$</span>V1),main<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Cummulative distribution #friends per user&#34;</span>, +</span></span><span style="display:flex;"><span> xlab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Number of friends&#34;</span>, ylab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Percentage number of users&#34;</span>, log<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;x&#34;</span>, +</span></span><span style="display:flex;"><span> xlim<span style="color:#f92672">=</span><span 
style="color:#a6e22e">c</span>(<span style="color:#ae81ff">0.8</span>, <span style="color:#a6e22e">max</span>(d2<span style="color:#f92672">$</span>V1) <span style="color:#f92672">+</span> <span style="color:#ae81ff">20</span>)) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">dev.off</span>() +</span></span></code></pre></div><h3 id="data-correlations">Data Correlations</h3> +<p>Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.</p> +<p><em>Which are the most popular names of a country?</em></p> +<p>We run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, <em>&lsquo;A_country_name&rsquo;</em> is the name of a particular country such as <em>&lsquo;Germany&rsquo;, &lsquo;Netherlands&rsquo;, or &lsquo;Vietnam&rsquo;</em>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> p_lastname, <span style="color:#66d9ef">count</span> (p_lastname) <span style="color:#66d9ef">as</span> namecnt +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">FROM</span> person, country +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> p_placeid <span style="color:#f92672">=</span> ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;A_country_name&#39;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> p_lastname <span style="color:#66d9ef">order</span> <span style="color:#66d9ef">by</span> namecnt <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as <em>Muller</em> is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names&rsquo; distribution may not be exactly the same as the contemporary names&rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.</p> +<p><img src="distribution-germany.png" alt="image"> <br> +Figure 2. 
Distribution of names in Germany</p> +<p><img src="distribution-netherlands.png" alt=""> <br> +Figure 3. Distribution of names in Netherlands</p> +<p><img src="distribution-vietnam.png" alt=""> <br> +Figure 4. Distribution of names in Vietnam</p> +<p><em>Where my friends are living?</em></p> +<p>We run the following query, which computes the locations of the friends of people living in China.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> fctry.ctry_name, <span style="color:#66d9ef">count</span> (<span style="color:#f92672">*</span>) <span style="color:#66d9ef">from</span> person <span style="color:#66d9ef">self</span>, person +</span></span><span style="display:flex;"><span>friend, country pctry, knows, country fctry +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> pctry.ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;China&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> <span style="color:#66d9ef">self</span>.p_placeid <span style="color:#f92672">=</span> pctry.ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> <span style="color:#66d9ef">self</span>.p_personid <span style="color:#66d9ef">and</span> friend.p_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> fctry.ctry_city <span style="color:#f92672">=</span> friend.p_placeid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> fctry.ctry_name <span 
style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.</p> +<p><img src="chinese-friends.png" alt=""> <br> +Figure 5. Locations of friends of people in China</p> +<p><em>Where my friends are studying?</em></p> +<p>Finally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> o2.o_name, <span style="color:#66d9ef">count</span>(o2.o_name) <span style="color:#66d9ef">from</span> knows, person_university +</span></span><span style="display:flex;"><span>p1, person_university p2, organisation o1, organisation o2 +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> +</span></span><span style="display:flex;"><span> p1.pu_organisationid <span style="color:#f92672">=</span> o1.o_organisationid +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> o1.o_name<span style="color:#f92672">=</span><span style="color:#e6db74">&#39;Hangzhou_International_School&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> p1.pu_personid <span style="color:#66d9ef">and</span> p2.pu_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> 
p2.pu_organisationid <span style="color:#f92672">=</span> o2.o_organisationid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> o2.o_name <span style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, top-10 universities for the friends of the Hangzhou School students’ are from China, while people from foreign universities have small number of friends that study in Hangzhou School (See Table 1).</p> +<p><img src="friends-international-school.png" alt=""> <br> +Figure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.</p> +<table> +<thead> +<tr> +<th>Name</th> +<th># of friends</th> +</tr> +</thead> +<tbody> +<tr> +<td>Hangzhou_International_School</td> +<td>12696</td> +</tr> +<tr> +<td>Anhui_University_of_Science_and_Technology</td> +<td>4071</td> +</tr> +<tr> +<td>China_Jiliang_University</td> +<td>3519</td> +</tr> +<tr> +<td>&hellip;</td> +<td></td> +</tr> +<tr> +<td>Darmstadt_University_of_Applied_Sciences</td> +<td>1</td> +</tr> +<tr> +<td>Calcutta_School_of_Tropical_Medicine</td> +<td>1</td> +</tr> +<tr> +<td>Chettinad_Vidyashram</td> +<td>1</td> +</tr> +<tr> +<td>Women&rsquo;s_College_Shillong</td> +<td>1</td> +</tr> +<tr> +<td>Universitas_Nasional</td> +<td>1</td> +</tr> +</tbody> +</table> +<p>Table 1. Universities where friends of Hangzhou International School students are studying at.</p> +<p>In a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. 
Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduces many of those important characteristics found in a real social network, and additionally introduces a series of plausible correlations in it. More and more interesting data correlations may also be found from playing with the SNB generated data.</p> + + + + + SNB Driver - Part 1 + https://ldbcouncil.org/post/snb-driver-part-1/ + Thu, 27 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-1/ + <p>In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver are available here: <a href="https://github.com/ldbc/ldbc_driver/">https://github.com/ldbc/ldbc_driver/</a>. Multiple reference implementations by two vendors are available here: <a href="https://github.com/ldbc/ldbc_snb_implementations">https://github.com/ldbc/ldbc_snb_implementations</a>, and discussion of the schema, data properties, and related content is available here: <a href="https://github.com/ldbc/ldbc_snb_docs">https://github.com/ldbc/ldbc_snb_docs</a>.</p> +<p>The following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.</p> +<h3 id="problem-description">Problem Description</h3> +<p>The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person&rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. 
If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting transaction stream into parallel clients (terminals) is trivial. However, for LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.</p> +<p>Consider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment to the picture of User A, etc. The second event depends on the first one in a sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in case of the single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. 
Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.</p> +<p>The next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.</p> + + + + + Making Semantic Publishing Execution Rules + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + Tue, 18 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + <p><a href="https://ldbcouncil.org/">LDBC</a> <a href="https://ldbcouncil.org/benchmarks/spb">SPB (Semantic Publishing Benchmark)</a> is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC&rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an <a href="https://www.ontotext.com/products/ontotext-graphdb-owlim/">Ontotext Graph DB</a> deployment. Graph DB was formerly known as Owlim.</p> +<p>So, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.</p> +<p>SPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:</p> +<ul> +<li> +<p>Some queries are centred on a particular search or entity. The data touched by the query size does not grow at the same rate as the dataset.</p> +</li> +<li> +<p>Some queries cover whole cross sections of the dataset, e.g. 
find the most popular tags across the whole database.</p> +</li> +</ul> +<p>These different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.</p> +<p>Another guiding factor of SPB was the BBC&rsquo;s and others&rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.</p> +<p>Normally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:</p> +<ul> +<li> +<p>Updates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.</p> +</li> +<li> +<p>Short queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.</p> +</li> +<li> +<p>Analytics - These cover a large fraction of the dataset and are roughly linear to data size.</p> +</li> +</ul> +<p>A test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. 
A given system may be especially good at one aspect, leading the test sponsor to accentuate this.</p> +<p>The benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 (1.6 billion triples) SPB database we have the following space consumption figures:</p> +<ul> +<li> +<p>46886 MB of RDF literal text</p> +</li> +<li> +<p>23924 MB of full text index for RDF literals</p> +</li> +<li> +<p>23598 MB of URI strings</p> +</li> +<li> +<p>21981 MB of quads, stored column-wise with default index scheme</p> +</li> +</ul> +<p>Clearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.</p> +<p>Let&rsquo;s now look at a full run at unit scale, i.e. 50M triples.</p> +<p>The run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. 
The run stops by itself when the last of the analytical mixes finishes.</p> +<p>The interactive driver reports:</p> +<pre tabindex="0"><code>Seconds run : 2144 + Editorial: + 2 agents + + 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) + 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) + 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) + + 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) + 39.7122 average operations per second + + Aggregation: + 20 agents + + 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) + 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) + 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) + 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) + 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) + 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) + 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) + 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) + 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) + 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) + 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) + + 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] + + 45318 total retrieval queries (0 timed-out) + 22.5239 average queries per second +</code></pre><p>The analytical driver reports:</p> +<pre tabindex="0"><code>Aggregation: + 2 agents + + 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) + 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) + 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) + 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) + 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) + 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) + 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) + 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) + 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) + 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) + 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) + 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) + 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) + 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) + 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] + + 180 total retrieval queries (2 timed-out) + 0.0862 average queries per second +</code></pre><p>The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)</p> +<p>The SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). 
The DBMS is Virtuoso open source, (<a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack at github.com</a>, <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics</a>).</p> +<p>The minimum update rate of 7/s was sustained but fell short of the target of 70/s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. The analytics mix is for example about 3x faster without other concurrent activity.</p> +<p>Is this good or bad? I would say that this is possible but better can certainly be accomplished.</p> +<p>The initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualifies the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.</p> +<p>As an initial comment of the workload mix, I&rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.</p> +<p>Adjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.</p> +<p>In the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.</p> + + + + + Fifth TUC Meeting + https://ldbcouncil.org/event/fifth-tuc-meeting/ + Fri, 14 Nov 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fifth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its fifth Technical User<br> +Community (TUC) meeting.</p> +<p>This will be a one-day event at the National Hellenic Research Institute<br> +in Athens, Greece on <strong>Friday November 14, 2014</strong>.</p> +<h3 id="agenda">Agenda</h3> +<p>10:30 - 11:00 Coffee Break</p> +<p>11:00 - 11:10 Peter Boncz (VUA) Welcome &amp; LDBC project status update (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979841.pptx">Presentation</a>)</p> +<p>11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark:Short Presentation of SPB and Status</p> +<p>Feedback &amp; Roadmap for SPB &amp; OWLIM (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979839.pdf">Presentation</a>)</p> +<p>11:25 - 11:30 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SPB &amp; Virtuoso (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979828.pdf">Presentation</a>)</p> +<p>11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback &amp; Roadmap for SNB &amp; Neo4J (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979830.pdf">Presentation</a>)</p> +<p>11:45 - 12:00 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SNB &amp; Virtuoso (<a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979829.pdf">Presentation</a>)</p> +<p>12:00 - 12:20 Arnau Prat (UPC) &amp; Andrey Gubichev Status, Feedback &amp; Roadmap for SNB Interactive &amp; Sparksee (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979836.pdf">Presentation</a> ) and Business Intelligence (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979837.pdf">Presentation</a>)</p> +<p>12:20 - 12:40 Tomer Sagi, &ldquo;Experience with SNB and TitanDB at HP&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979838.pptx">Presentation</a> )</p> +<p>12:40 - 13:00 Jakob Nelson, &ldquo;graphbench.org on the SNB datagen&rdquo;</p> +<p>13:00 - 14:30 Lunch Break@Byzantine &amp; Christian Museum (<a href="http://www.byzantinemuseum.gr/en/">link</a>)</p> +<p>14:30 - 14:50 Olaf Hartig, &ldquo;Integrating the Property Graph and RDF data models&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979831.pdf">Presentation</a>)\</p> +<p>Documents: <a href="http://arxiv.org/abs/1409.3288">arxiv/1409.3288</a>, <a href="http://arxiv.org/abs/1406.3399">arxiv/1406.3399</a></p> +<p>14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, &ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979842.pdf">Presentation</a>)</p> +<p>15:10 - 15:30 Evaggelia Pitoura, &ldquo;Historical Queries on Graphs&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979835.pdf">Presentation</a>)</p> +<p>15:30 - 16:00 Coffee Break</p> +<p>16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, 
&ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979843.pdf">Presentation</a>)</p> +<p>16:20 - 16:40 Gunes Aluc, &ldquo;WatDiv: How to Tune-up your RDF Data Management System&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979832.pdf">Presentation</a>)</p> +<p>16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, &ldquo;Benchmarking @LogicBlox&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979840.pdf">Presentation</a>)</p> +<p>17:00 - 17:15 Hassan Chafi, &ldquo;Oracle Labs Graph Strategy&rdquo;</p> +<p>17:15 - 17:25 Yinglong Xia, &ldquo;Property Graphs for Industry Solution at IBM&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979834.pdf">Presentation</a>)</p> +<p>17:25 - 17:30 Arthur Keen, &ldquo;Short Introduction to SPARQLcity&rdquo;</p> +<p><em><strong>20:30 Dinner @ Konservokouti <a href="https://plus.google.com/114240752029716758955/about?gl=gr&amp;hl=en">(link)</a></strong></em></p> +<p><em><strong>Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion</strong></em></p> +<h4 id="logistics">Logistics</h4> +<p>The meeting will be held at the <a href="http://www.eie.gr/index-en.html">National Hellenic Research Foundation</a> located in <a href="http://www.eie.gr/location-en.html">downtown Athens</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/5964344.gif" alt=""></p> +<h4 id="travel">Travel</h4> +<p>Athens, Greece&rsquo;s capital city, is easily accessible by air. 
Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.</p> +<p>To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).</p> +<p>You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: <a href="http://www.aia.gr/traveler/">http://www.aia.gr/traveler/</a></p> + + + + + Getting Started With the Semantic Publishing Benchmark + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + Sun, 09 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + <p>The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.</p> +<p>The benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:</p> +<ul> +<li>clustering of creative works around certain entities from the reference datasets (e.g. 
the association of an entity with creative works would decay exponentially in time)</li> +<li>correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time)</li> +</ul> +<p>The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.</p> +<p>Currently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. 
The queries aim at checking different choke points relevant to query optimisation such as:</p> +<ul> +<li>join ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema</li> +<li>subselects that aggregate the query results that the optimiser should recognise and evaluate first</li> +<li>optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last</li> +<li>reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.)</li> +<li>unions to be executed in parallel</li> +<li>optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results</li> +<li>ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results</li> +<li>handling of geo-spatial predicates</li> +<li>full-text search optimisation</li> +<li>asynchronous execution of the aggregate sub-queries</li> +<li>use of distinct to choose the optimal query plan</li> +</ul> +<p>We give below Query 1 of the Semantic Publishing Benchmark.</p> +<pre tabindex="0"><code>PREFIX bbcevent:&lt;http://www.bbc.co.uk/ontologies/event/&gt; +PREFIX geo-pos:&lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; +PREFIX bbc:&lt;http://www.bbc.co.uk/ontologies/bbc/&gt; +PREFIX time:&lt;http://www.w3.org/2006/time#&gt; +PREFIX event:&lt;http://purl.org/NET/c4dm/event.owl#&gt; +PREFIX music-ont:&lt;http://purl.org/ontology/mo/&gt; +PREFIX rdf:&lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt; +PREFIX foaf:&lt;http://xmlns.com/foaf/0.1/&gt; +PREFIX provenance:&lt;http://www.bbc.co.uk/ontologies/provenance/&gt; +PREFIX owl:&lt;http://www.w3.org/2002/07/owl#&gt; +PREFIX cms:&lt;http://www.bbc.co.uk/ontologies/cms/&gt; +PREFIX news:&lt;http://www.bbc.co.uk/ontologies/news/&gt; +PREFIX 
cnews:&lt;http://www.bbc.co.uk/ontologies/news/cnews/&gt; +PREFIX cconcepts:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX dbp-prop:&lt;http://dbpedia.org/property/&gt; +PREFIX geonames:&lt;http://sws.geonames.org/&gt; +PREFIX rdfs:&lt;http://www.w3.org/2000/01/rdf-schema#&gt; +PREFIX domain:&lt;http://www.bbc.co.uk/ontologies/domain/&gt; +PREFIX dbpedia:&lt;http://dbpedia.org/resource/&gt; +PREFIX geo-ont:&lt;http://www.geonames.org/ontology#&gt; +PREFIX bbc-pont:&lt;http://purl.org/ontology/po/&gt; +PREFIX tagging:&lt;http://www.bbc.co.uk/ontologies/tagging/&gt; +PREFIX sport:&lt;http://www.bbc.co.uk/ontologies/sport/&gt; +PREFIX skosCore:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX dbp-ont:&lt;http://dbpedia.org/ontology/&gt; +PREFIX xsd:&lt;http://www.w3.org/2001/XMLSchema#&gt; +PREFIX core:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX curric:&lt;http://www.bbc.co.uk/ontologies/curriculum/&gt; +PREFIX skos:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX cwork:&lt;http://www.bbc.co.uk/ontologies/creativework/&gt; +PREFIX fb:&lt;http://rdf.freebase.com/ns/&gt; + +# Query Name : query1 +# Query Description : +# Retrieve creative works about thing t (or that mention t) +# reasoning: rdfs:subClassOf, rdf:type +# join ordering: cwork:dateModified rdf:type owl:FunctionalProperty +# join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty +# Choke Points : +# - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified +# Optimizer should use an efficient cost evaluation method for choosing the optimal join tree +# - A sub-select which aggregates results. Optimizer should recognize it and execute it first +# - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) +# Optimizer should decide to put optional triples on top of the join tree +# (i.e. 
delay their execution to the last possible moment) because OPTIONALs are treated as a left join +# - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork +# and eliminate first triple (?cwork a ?type .) since ?cwork is a cwork:CreativeWork​ + +CONSTRUCT { + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:shortTitle ?shortTitle ; + cwork:about ?about ; + cwork:mentions ?mentions ; + cwork:dateCreated ?created ; + cwork:dateModified ?modified ; + cwork:description ?description ; + cwork:primaryFormat ?primaryFormat ; + bbc:primaryContentOf ?webDocument . + ?webDocument bbc:webDocumentType ?webDocType . + ?about rdfs:label ?aboutLabel ; + bbc:shortLabel ?aboutShortLabel ; + bbc:preferredLabel ?aboutPreferredLabel . + ?mentions rdfs:label ?mentionsLabel ; + bbc:shortLabel ?mentionsShortLabel ; + bbc:preferredLabel ?mentionsPreferredLabel . + ?creativeWork cwork:thumbnail ?thumbnail . + ?thumbnail a cwork:Thumbnail ; + cwork:altText ?thumbnailAltText ; + cwork:thumbnailType ?thumbnailType . +} +WHERE { + { + SELECT ?creativeWork + WHERE { + ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . + ?creativeWork a cwork:CreativeWork ; + cwork:dateModified ?modified . + } + ORDER BY DESC(?modified) + LIMIT 10 + } + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:dateModified ?modified . + OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } + OPTIONAL { ?creativeWork cwork:description ?description . } + OPTIONAL { ?creativeWork cwork:about ?about . + OPTIONAL { ?about rdfs:label ?aboutLabel . } + OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } + OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } + } + OPTIONAL { + ?creativeWork cwork:mentions ?mentions . + OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } + OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . 
} + OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } + } + OPTIONAL { ?creativeWork cwork:dateCreated ?created . } + OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } + OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . + OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } + OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } + } +} +</code></pre><p>Listing 1. Semantic Publishing Benchmark: Query 1</p> +<p>The benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.</p> +<p>The data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. 
The current version of the benchmark has been tested with Virtuoso and OWLIM.</p> +<p>The generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log), and (c) the benchmark results (semantic_publishing_benchmark_results.log).</p> +<p>Below we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and deletions) and queries executed at the Nth second of a benchmark run. It also reports the total number of retrieval queries as well as the average number of queries executed per second.</p> +<pre tabindex="0"><code>Seconds run : 600 + Editorial: + 0 agents + + 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) + 0.0000 average operations per second + + Aggregation: + 8 agents + + 298 Q1 queries + 267 Q2 queries + 243 Q3 queries + 291 Q4 queries + 320 Q5 queries + 286 Q6 queries + 255 Q7 queries + 274 Q8 queries + 271 Q9 queries + + 2505 total retrieval queries + 4.1750 average queries per second +</code></pre><p>Listing 2. 
A snippet of semantic_publishing_benchmark_results.log</p> +<p>We ran the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries time out after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:</p> +<table> +<thead> +<tr> +<th>#triples</th> +<th>Q1</th> +<th>Q2</th> +<th>Q3</th> +<th>Q4</th> +<th>Q5</th> +<th>Q6</th> +<th>Q7</th> +<th>Q8</th> +<th>Q9</th> +<th>#queries</th> +<th>avg. #q. per sec.</th> +</tr> +</thead> +<tbody> +<tr> +<td>10M</td> +<td>298</td> +<td>267</td> +<td>243</td> +<td>291</td> +<td>320</td> +<td>286</td> +<td>255</td> +<td>274</td> +<td>271</td> +<td>2505</td> +<td>4.1750</td> +</tr> +<tr> +<td>100M</td> +<td>53</td> +<td>62</td> +<td>51</td> +<td>52</td> +<td>44</td> +<td>62</td> +<td>25</td> +<td>55</td> +<td>45</td> +<td>449</td> +<td>0.7483</td> +</tr> +<tr> +<td>1B</td> +<td>34</td> +<td>29</td> +<td>22</td> +<td>24</td> +<td>25</td> +<td>29</td> +<td>0</td> +<td>29</td> +<td>28</td> +<td>220</td> +<td>0.3667</td> +</tr> +</tbody> +</table> + + + + + Choke Point Based Benchmark Design + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + <p>The <em>Linked Data Benchmark Council</em> (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and 
arbitrate around the official benchmark results. The council and its <a href="https://ldbcouncil.org">https://ldbcouncil.org</a> website just launched, and in its first 1.5 year of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (<a href="https://ldbcouncil.org/event/fifth-tuc-meeting">next TUC meeting</a> will be on October 5 in Athens) and indeed in <em>designing benchmarks</em>.</p> +<p>So, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by <a href="http://www.tpc.org/">TPC</a> have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are <em>relevant</em> and <em>representative</em> (address important challenges encountered in practice), <em>understandable</em> , <em>economical</em> (implementable on simple hardware), <em>fair</em> (such as not to favor a particular product or approach), <em>scalable</em>, <em>accepted</em> by the community and <em>public</em> (e.g. all of its software is available in open source). This list stems from Jim Gray&rsquo;s <a href="http://research.microsoft.com/en-us/um/people/gray/BenchmarkHandbook/TOC.htm">Benchmark Handbook</a>. In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.</p> +<p>A very important aspect of benchmark development is making sure that the community <em>accepts</em> a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. 
Since in LDBC multiple commercial graph and RDF vendors are at the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on <strong>fairness</strong> had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its <a href="https://ldbcouncil.org/tags/tuc-meeting/">Technical User Community gatherings</a>) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.</p> +<p>The need for <em>understandability</em> for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. 
Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.</p> +<p>The <em>economical</em> aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machine-code SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.</p> +<p><em>Representative</em> benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. 
A rigorous example of this is the <a href="http://aksw.org/Projects/DBPSB.html">DBpedia benchmark</a> whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log only contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).</p> +<p>The fact that a benchmark can be <em>scaled</em> in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. 
Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) &ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.</p> +<p>Now, what makes a benchmark <em>relevant</em>? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of <em>&ldquo;choke points&rdquo;</em>: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute value of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). 
The notion observed in practice is that people who are direct colleagues, often are in each other's friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore a suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.</p> +<p>To illustrate what choke points are in more depth, we wrote a <a href="https://ldbcouncil.org/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf">paper in the TPCTC 2013</a> conference that performs a post-mortem analysis of TPC-H and identifies 28 such choke points. <em><a href="chokepoints.png">This table</a></em> lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):</p> +<p>I would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper, RDF-3X), and me (MonetDB, Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can <a href="http://www.openlinksw.com/weblog/oerling/?id=1779">trivialize Q18</a>; and this optimization can single-handedly improve the overall TPC-score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.</p> +<p>LDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark <a href="https://ldbcouncil.org/developer/spb">(SPB)</a> and the more graph-focused Social Network Benchmark (<a href="https://ldbcouncil.org/developer/snb">SNB</a>), and <a href="https://groups.google.com/forum/#!forum/ldbcouncil">tell us what you think</a>. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.</p> +<p><em>(for more posts from Peter Boncz, see also <a href="https://databasearchitects.blogspot.com">Database Architects</a>, a blog about data management challenges and techniques written by people who design and implement database systems)</em></p> + + + + + New Website Online LDBC Benchmarks Reach Public Draft + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + <p>The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. 
Watch this blog for the announcements to come, as this will be a matter of weeks to add.</p> +<p>The Public Draft stage means that the initial software (data generator, query driver) works and an initial technical specification and documentation have been written. In other words, there is a testable version of the benchmark available for anyone who is interested. Public Draft status does not mean that the benchmarks have been adopted yet; rather, it means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth <a href="https://ldbcouncil.org/event/fifth-tuc-meeting">Technical User Community meeting</a>. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.</p> +<p>You can also see that we created this new website and a new logo. This website is different from <code>http://ldbc.eu</code> that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.</p> +<p>In the next weeks, you will see many contributors in LDBC post items on this blog. 
Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.</p> + + + + + Social Network Benchmark Goals + https://ldbcouncil.org/post/social-network-benchmark-goals/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/social-network-benchmark-goals/ + <p>Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.</p> +<p>From a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.</p> +<p>With the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, <a href="https://ldbcouncil.org/benchmarks/snb">SNB</a>, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. 
Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.</p> +<p>The SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">LDBC Github repository</a>. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD programming contest 2014</a>.</p> +<p>The SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.</p> +<p>More details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.</p> + + + + + Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + <p>It is with great pleasure that we announce the new LDBC organisation site at <a href="https://www.ldbcouncil.org">www.ldbcouncil.org</a>. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the <em>benchmarks</em> menu on this site.</p> +<p>Those benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.</p> +<p>While the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.</p> +<p>We want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.</p> +<p>Finally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.</p> +<p>In all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.</p> + + + + + 2nd International Workshop on Benchmarking RDF Systems + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + <p>Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of the TPCTC series of workshops. The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.</p> +<p>More at: <a href="http://events.sti2.at/bersys2014/">http://events.sti2.at/bersys2014/</a></p> + + + + + DATAGEN: Data Generation for the Social Network Benchmark + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + <p>As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 <a href="#references">[1]</a>.</p> +<p>One of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.</p> +<p>For these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:</p> +<ul> +<li><strong>Realism.</strong> The data generated by DATAGEN has to mimic the features of those found in a real social network. 
In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. DATAGEN is aware of the data and link distributions found in a real social network such as Facebook <a href="#references">[2]</a>. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated.</li> +<li><strong>Scalability.</strong> Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters.</li> +<li><strong>Determinism.</strong> DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus, making the comparisons between different systems fair and the benchmarks’ results reproducible.</li> +<li><strong>Usability.</strong> LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been strongly influenced by this philosophy, and therefore it has been designed to be as easy to use as possible.</li> +</ul> +<p>Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (<a href="https://github.com/ldbc/ldbc_snb_datagen">https://github.com/ldbc/ldbc_snb_datagen</a>).</p> +<h4 id="references">References</h4> +<p>[1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. &ldquo;S3g2: A scalable structure-correlated social graph generator.&rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 
156-172.</p> +<p>[2] Prat-Pérez, Arnau, and David Dominguez-Sal. &ldquo;How community-like is the structure of synthetically generated graphs?.&rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. ACM, 2014.</p> + + + + + Getting Started With SNB + https://ldbcouncil.org/post/getting-started-with-snb/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-snb/ + <p>In a previous blog post titled &ldquo;<a href="https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/">Is SNB like Facebook&rsquo;s LinkBench?</a>&rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.</p> +<h3 id="datagen">DATAGEN</h3> +<p>DATAGEN is the data generator used by all the workloads of SNB. <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/">Here</a> we introduced the design goals that drive the development of DATAGEN, which can be summarized as: <em>Realism, Scalability, Determinism and Usability.</em></p> +<p>DATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.</p> +<p><img src="schema.png" alt="image"></p> +<p>For the sake of credibility, data produced by DATAGEN has to be realistic. 
In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:</p> +<ul> +<li> +<p>Realistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as DBpedia.</p> +</li> +<li> +<p>Correlated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical of that country, to work at companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.</p> +</li> +</ul> +<p>DATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.</p> +<p>Finally, DATAGEN outputs two other things:</p> +<ul> +<li> +<p>Update Streams, which will be used in the future to implement updates in the workloads.</p> +</li> +<li> +<p>Substitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so that the query plans of the resulting query executions do not differ significantly.</p> +</li> +</ul> +<p>Configuring and using DATAGEN is easy. 
Please visit <a href="https://github.com/ldbc/ldbc_snb_datagen">this page</a> for more information.</p> +<h3 id="ldbc-driver">LDBC driver</h3> +<p>SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.</p> +<p>It is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation <a href="https://github.com/ldbc/ldbc_driver/wiki">page</a>.</p> +<p>The test sponsor (aka the implementer of the benchmark), has to provide a set of implemented interfaces, that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.</p> +<p>Given a workload consisting of a series of <em>Operations</em>, the test sponsor implements <em>OperationHandlers</em> for them. <em>OperationHandlers</em> are responsible for executing instances of a specific operation (query) type. This is done by overriding the method <em>executeOperation</em>(), which receives as input parameter an <em>Operation</em> instance and returns the result. From the <em>Operation</em> instance, the operation&rsquo;s input parameters can be retrieved, as well as the database connection state.</p> +<p>The database connector is used to initialize, cleanup and get the database connection state. 
The database connector must implement the <em>Db</em> interface, which consists of three methods: <em>onInit</em>(), <em>onCleanup</em>() and <em>getConnectionState</em>(). <em>onInit</em>() is called before the benchmark is executed, and is responsible for initializing the database and registering the different <em>OperationHandlers</em>. <em>onCleanup</em>() is called after the benchmark has completed. Any resources that need to be released should be released here.</p> +<p>Finally, <em>getConnectionState</em>() returns an instance of <em>DbConnectionState</em>, which encapsulates any state that needs to be shared between <em>OperationHandler</em> instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.</p> +<p>A good example of how to implement the benchmark can be found <a href="https://github.com/ldbc/ldbc_driver/wiki/Implementing%20a%20Database%20Connector">here</a>.</p> +<h3 id="workloads">Workloads</h3> +<p>Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are in the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.</p> +<p>The Interactive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the database. These queries are designed to be interactive and target systems capable of responding to such queries with low latency for multiple concurrent users. 
Examples of Interactive queries are, given a user, retrieving those friends with a specific name, or finding the most recent post and comments created by your friends.</p> +<p>The Business Intelligence workload will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.</p> +<p>Examples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.</p> +<p>Finally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality, are representative queries of this workload, and will imply touching a vast amount of the dataset.</p> +<h3 id="final-remarks">Final remarks</h3> +<p>This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification <a href="https://github.com/ldbc/ldbc_snb_docs">draft</a>, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.</p> + + + + + Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + <p>The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. 
This post introduces the interactive workload.</p> +<p>The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user&rsquo;s social environment and potentially access data associated with the friends of a user and their friends.</p> +<p>This is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or E. The emphasis is on presenting a rich and timely view of a constantly changing environment.</p> +<p>SNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook LinkBench but SNB Interactive goes well beyond this in richness of schema and queries.</p> +<p>The challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB interactive seeks more agility and ad hoc capability also on the operational side.</p> +<p>The dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. 
This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.</p> +<p>The metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.</p> +<p>Different technologies can be used for implementing SNB interactive. The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database APIs.</p> +<p>SNB Interactive is an example of LDBC&rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.</p> +<p>The benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. 
Specifics of availability and coverage may vary.</p> +<p>Subsequent posts will address the workload in more detail.</p> + + + + + Is SNB Like Facebooks LinkBench + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + <p>In this post, I will discuss in some detail the rationale and goals of the design of the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook&rsquo;s own graph benchmark called <a href="https://www.facebook.com/notes/facebook-engineering/linkbench-a-database-benchmark-for-the-social-graph/10151391496443920">LinkBench</a>. We think SNB is the most intricate graph database benchmark to date (it&rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference <a href="http://www.sigmod2014.org/">SIGMOD in Snowbird</a> after being used for this year&rsquo;s <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD Programming Contest</a>, which was about graph analytics.</p> +<p>SNB is intended to provide the following <strong>value</strong> to different stakeholders:</p> +<ul> +<li> +<p>For end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to <em>compare merits of different products</em> and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.</p> +</li> +<li> +<p>For vendors of graph database technology, SNB provides a <em>checklist of features</em> and performance characteristics that helps in product positioning and can serve to guide new development.</p> +</li> +<li> +<p>For researchers, both industrial and academic, the SNB dataset and workload provide <em>interesting challenges</em> in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provide a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.</p> +</li> +</ul> +<p>I should clarify that even though the data model of SNB resembles Facebook (and we&rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use; they don&rsquo;t need LDBC for that. Rather, we take social network data as a model for the much broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis are not just about finding data by value, but about learning about the <em>connection patterns</em> between data. 
The scenario of the SNB, a social network, was chosen with the following goals in mind:</p> +<ul> +<li> +<p>the benchmark scenario should be <strong>understandable</strong> to a large audience, and this audience should also understand the relevance of managing such data.</p> +</li> +<li> +<p>the scenario in the benchmark should cover the complete range of challenges <strong>relevant</strong> for graph data management, according to the benchmark scope.</p> +</li> +<li> +<p>the query challenges in it should be <strong>realistic</strong> in the sense that, though synthetic, similar data and workloads are encountered in practice.</p> +</li> +</ul> +<p>The SNB is in fact three distinct benchmarks with a common dataset, since there are <em>three different workloads</em>. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.</p> +<ul> +<li> +<p><strong>Interactive Workload.</strong> The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j&rsquo;s Cypher, SPARQL and SQL. The interactive workloads tests a system&rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. 
One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views; however, such structures need to be maintained with regard to the continuous inserts that are also part of the workload. This workload is now in draft stage, which means that the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a> and <a href="https://github.com/ldbc/ldbc_driver">driver software stack</a> are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and evolve them.</p> +</li> +<li> +<p><strong>Business Intelligence Workload.</strong> There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.</p> +</li> +<li> +<p><strong>Graph Analytics Workload.</strong> This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph. 
The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.</p> +</li> +</ul> +<p>All the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a>. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to <a href="http://people.cs.uchicago.edu/~tga/pubs/sigmod-linkbench-2013.pdf">LinkBench</a>, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the <strong>low-level</strong> MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. Note that <a href="http://borthakur.com/ftp/sigmod2013.pdf">Facebook&rsquo;s IT infrastructure</a> does not store all user data in MySQL and its modified memcached (&quot;<a href="http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/11730-atc13-bronson.pdf">TAO</a>&quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. 
However, for queries like in the SNB Interactive and BI workloads it <strong>does</strong> matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with <em>correlations</em>, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a &ldquo;choke point&rdquo; for graph data management system where LDBC wants to stimulate innovation).</p> + + + + + Making It Interactive + https://ldbcouncil.org/post/making-it-interactive/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-it-interactive/ + <p><em>Synopsis:</em> Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.</p> +<p>It is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.</p> +<p>As part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. 
These touch, depending on the scale, upwards of a million entities in the database.</p> +<p>Now, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.</p> +<p>So, LDBC SNB has a twofold task:</p> +<ol> +<li>In order to be a credible interactive workload, it must in fact have characteristics of one.</li> +<li>In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics.</li> +</ol> +<p>Designing a workload presents specific challenges:</p> +<ol> +<li>The workload must be realistic enough for users to identify with it.</li> +<li>The workload must pose challenges and drive innovation in a useful direction.</li> +<li>The component operations must all play a noticeable role in it. If an operation&rsquo;s relative performance does not affect the score, why is it in the workload?</li> +</ol> +<p>The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.</p> +<p>Very roughly, the choke points (technical challenges) of SNB interactive are as follows:</p> +<ul> +<li>Random access - Traversing between people, content makes large numbers of random lookups. These can be variously parallelized and/or vectored.</li> +<li>Query optimization must produce the right plans - The primary point is join order and join type. Index vs. 
hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join.</li> +<li>When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations.</li> +<li>Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities.</li> +</ul> +<p>Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is do-able and plausible in the scenario.</p> +<p>In online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.</p> +<p>A key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.</p> +<p>The other aspect is the metric, typically some variation on operations per unit of time.</p> +<p>All these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio. 
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.</p> +<p>So how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.</p> +<p>But SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.</p> +<p>Having this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to meet the following objectives:</p> +<ul> +<li>Short queries should collectively be about 45% of the CPU load.</li> +<li>Updates will be under 5%.</li> +<li>Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others.</li> +</ul> +<p>The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.</p> +<p>There is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.</p> +<p>In the next post we will look at the actual mix and execution times on the test system.</p> + + + + + SNB Data Generator - Getting Started + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">this</a> and <a href="https://ldbcouncil.org/post/getting-started-with-snb">this</a>) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. 
In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.</p> +<h3 id="getting-and-configuring-hadoop">Getting and Configuring Hadoop</h3> +<p>DATAGEN runs on top of hadoop 1.2.1 to be scalable. You can download it from here. Open a console and type the following commands to decompress hadoop into the /home/user folder:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz hadoop-1.2.1.tar.gz +</span></span></code></pre></div><p>For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done for configuring it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster machine), visit the <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/wiki/Configuration">LDBC DATAGEN wiki</a>.</p> +<h3 id="getting-and-configuring-datagen">Getting and configuring DATAGEN</h3> +<p>Before downloading DATAGEN, be sure to fulfill the following requirements:</p> +<ul> +<li>Linux based machine</li> +<li>java 1.6 or greater</li> +<li>python 2.7.X</li> +<li>maven 3</li> +</ul> +<p>After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/releases">here</a>. 
Again, decompress the downloaded file with the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz +</span></span></code></pre></div><p>This will create a folder called “ldbc_snb_datagen-0.1.2”.</p> +<p>DATAGEN provides <em>run.sh</em>, a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>HADOOP_HOME<span style="color:#f92672">=</span>/home/user/hadoop-1.2.1 +</span></span><span style="display:flex;"><span>LDBC_SNB_DATAGEN_HOME<span style="color:#f92672">=</span>/home/user/ldbc_snb_datagen +</span></span></code></pre></div><p>HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute the <em>run.sh</em> script to compile and execute DATAGEN using default parameters. 
Type the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user/ldbc_snb_datagen-0.1.2 +</span></span><span style="display:flex;"><span>$ ./run.sh +</span></span></code></pre></div><p>This will run DATAGEN, and two folders will be created in the same directory: <em>social_network</em> containing the scale factor 1 dataset with csv uncompressed files, and <em>substitution_parameters</em> containing the substitution parameters needed by the driver to execute the benchmark.</p> +<h3 id="changing-the-generated-dataset">Changing the generated dataset</h3> +<p>The characteristics of the dataset to be generated are specified in the <em>params.ini</em> file. By default, this file has the following content:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:1</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:1</span> +</span></span></code></pre></div><p>The following is the list of options and their default values supported by DATAGEN:</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>scaleFactor</td> +<td>1</td> +<td>&ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000&rdquo;</td> +</tr> +<tr> +<td>serializer</td> +<td>csv</td> +<td>&ldquo;The format of the output data. 
Options are: csv, csv_merge_foreign, ttl&rdquo;</td> +</tr> +<tr> +<td>compressed</td> +<td>FALSE</td> +<td>Specifies to compress the output data in gzip.</td> +</tr> +<tr> +<td>outputDir</td> +<td>./</td> +<td>Specifies the folder to output the data.</td> +</tr> +<tr> +<td>updateStreams</td> +<td>FALSE</td> +<td>&ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static&rdquo;</td> +</tr> +<tr> +<td>numThreads</td> +<td>1</td> +<td>Sets the number of threads to use. Only works for pseudo-distributed mode</td> +</tr> +</tbody> +</table> +<p>For instance, a possible <em>params.ini</em> file could be the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:30</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:ttl</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For those not interested on generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>numPersons</td> +<td>-</td> +<td>The number of persons to generate</td> +</tr> +<tr> +<td>numYears</td> +<td>-</td> +<td>The amount of years of activity</td> 
+</tr> +<tr> +<td>startYear</td> +<td>-</td> +<td>The start year of simulation.</td> +</tr> +</tbody> +</table> +<p>The following is an example of another possible <em>params.ini</em> file</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">numPersons:100000</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numYears:3</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">startYear:2010</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv_merge_foreign</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/">GitHub</a>!</p> + + + + + The Day of Graph Analytics + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + <p><em>Note: consider this post as a continuation of the &ldquo;<a href="https://ldbcouncil.org/post/making-it-interactive">Making it interactive</a>&rdquo; post by Orri Erling.</em></p> +<p>I have now completed the <a href="https://github.com/openlink/virtuoso-opensource">Virtuoso</a> TPC-H work, including scale out. 
Optimization possibilities extend to infinity but the present level is good enough. <a href="http://www.tpc.org/tpch/">TPC-H</a> is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.</p> +<p>So, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.</p> +<p>The BI part is like TPC-H, except for adding the following challenges:</p> +<ul> +<li> +<p>Joins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.</p> +</li> +<li> +<p>Transitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.</p> +</li> +<li> +<p>Transitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.</p> +</li> +<li> +<p>Graph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. 
This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.</p> +</li> +<li> +<p>Running one query with parameters from different buckets, implying different best plan.</p> +</li> +<li> +<p>Data correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.</p> +</li> +<li> +<p>Large intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.</p> +</li> +<li> +<p>More unions and outer joins.</p> +</li> +</ul> +<p>The idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.</p> +<p>For rules and metric, we can use the TPC-H or <a href="http://www.tpc.org/tpcds/default.asp">TPC-DS</a> ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and <a href="http://www.openstreetmap.org/">Open Street Map</a>. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.</p> +<p>Doing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.</p> +<p>As a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. 
Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or &ldquo;Platonic&rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.</p> +<p>So, the transitive derived table construct can have pluggable aggregations, e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.</p> +<p>The distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. forward and backward in betweenness centrality.</p> +<p>This brings us to the graph analytics proper, which is often done in BSP style, e.g. <a href="http://es.slideshare.net/shatteredNirvana/pregel-a-system-for-largescale-graph-processing">Pregel</a>, <a href="http://giraph.apache.org">Giraph</a>, <a href="http://uzh.github.io/signal-collect/">Signal-Collect</a>, some but not all <a href="http://ppl.stanford.edu/main/green_marl.html">Green-Marl</a> applications. 
In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.</p> +<p>With BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as <a href="http://graphbench.org/">graphbench.org</a> but will be adapted to the SNB dataset.</p> +<p>The analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. 
The interested may pitch in as the matter comes up.</p> + + + + + Using LDBC SPB to Find OWLIM Performance Issues + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + Wed, 20 Aug 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + <p>During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (LDBC-SPB) as a part of our development and release process.</p> +<p>The first thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.</p> +<p>Initially we’ve decided to fix some of the benchmark parameters:</p> +<ul> +<li>the dataset size - 50 million triples (LDBC-SPB50)</li> +<li>benchmark warmup and benchmark run times - 60s and 600s respectively.</li> +<li>maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations)</li> +<li>maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations)</li> +<li>generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run.</li> +</ul> +<p>Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.</p> +<p>We’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. 
Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms of editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.</p> +<p>The following two charts show how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on bare-metal hardware.</p> +<p><img src="16-2-Performance.png" alt="image"></p> +<p>Figure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.</p> +<p><img src="8-0-Performance.png" alt="image"></p> +<p>Figure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation agents running simultaneously. Read-only mode.</p> +<p>Another thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark&rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners can not be compared to backward-chaining ones which is not the goal of the benchmark. 
Loading performance is not measured “officially” by LDBC-SPB (although time for loading the data is reported), but it is a good thing to have when comparing RDF Stores.</p> +<p>An additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called <em>checkConformance</em>. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. The result of that phase is a list of all rules that have been passed or failed which is very useful for regression testing.</p> + + + + + Fourth TUC meeting + https://ldbcouncil.org/event/fourth-tuc-meeting/ + Thu, 03 Apr 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fourth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the fourth Technical User Community (TUC) meeting.</p> +<p>This will be a one-day event at CWI in Amsterdam on <em>Thursday April 3, 2014</em>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<p><strong>For presenters please limit your talks to just 15 minutes</strong></p> +<h3 id="agenda">Agenda</h3> +<p><strong>April 3rd</strong></p> +<ul> +<li> +<p>10:00 Peter Boncz (VUA) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506371.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=JYWVgrP1kVY">video</a>: <em>LDBC project status update</em></p> +</li> +<li> +<p>10:20 Norbert Martinez (UPC) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506375.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=4yREJQ3yDr0">video</a>: <em>Status update on the LDBC Social Network Benchmark (SNB) task force</em>.</p> +</li> +<li> +<p>10:50 Alexandru Iosup (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506363.ppt">ppt</a>, <a href="https://www.youtube.com/watch?v=ulT-RFwKpOE">video</a>: <em>Towards Benchmarking Graph-Processing Platforms</em></p> +</li> +<li> +<p>11:10 Mike Bryant (Kings College) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506364.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=KiHRTu9xx0A">video</a>: <em>EHRI Project: Archival Integration with Neo4j</em></p> +</li> +</ul> +<p><strong>11:30 coffee</strong></p> +<ul> +<li> +<p>11:50 Thilo Muth (University of Magdeburg) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506369.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=5xH3UDLP6Oc">video</a>: <em>MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis</em></p> +</li> +<li> +<p>12:10 Davy Suvee (Janssen Pharmaceutica / Johnson &amp; Johnson) – <a 
href="https://www.youtube.com/watch?v=XN3LRJUfJIU">video</a>: <em>Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph</em></p> +</li> +<li> +<p>12:30 Yongming Luo (TU Eindhoven) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506366.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=g_my3tBB2_s">video</a>: <em>Regularities and dynamics in bisimulation reductions of big graphs</em></p> +</li> +<li> +<p>12:50 Christopher Davis (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506370.pdf">pdf</a>, <a href="https://www.youtube.com/channel/UC6HbzfJ4016Vez-2HKNeDag">video</a>: <em>Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues</em></p> +</li> +</ul> +<p><strong>13:10 - 14:30 lunch @ restaurant Polder</strong></p> +<ul> +<li> +<p>14:30 <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506365.pptx">SPB task force report</a></p> +</li> +<li> +<p>15:00 Bastiaan Bijl (Sysunite) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506373.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=TsCeKDHShMY">video</a>: <em>Using a semantic approach for monitoring applications in large engineering projects</em></p> +</li> +<li> +<p>15:20 Frans Knibbe (Geodan) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506372.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=uAX-m4OewPM">video</a>: <em>Benchmarks for geographical data</em></p> +</li> +<li> +<p>15:40 Armando Stellato (University of Rome, Tor Vergata &amp; UN Food and Agriculture Organization) – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506374.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=mfA4csAs72Y">video</a>: <em>VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges</em></p> +</li> +</ul> +<p><strong>16:00 coffee</strong></p> +<ul> +<li> +<p>16:20 Ralph Hodgson (TopQuadrant) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=ZUDnVw9P_Rc">video</a>: <em>Customer experiences in implementing SKOS-based vocabulary management systems</em></p> +</li> +<li> +<p>16:40 Simon Jupp (European Bioinformatics Institute) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506368.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=CgTuOGK92W8">video</a>: <em>Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.</em></p> +</li> +<li> +<p>17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506381.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=QTc3yOgoEsg">video</a>: <em>Breakmarking UniProt RDF. 
SPARQL queries that make your database cry&hellip;</em></p> +</li> +<li> +<p>17:20 Rein van &rsquo;t Veer (Digital Heritage Netherlands) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506380.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=2vDrZoskGyQ">video</a> <em>Time and space for heritage</em></p> +</li> +<li> +<p>17:40 <strong>end of meeting</strong></p> +</li> +<li> +<p>19:00 - 21:30 Social Dinner in restaurant Boom</p> +</li> +</ul> +<p><strong>April 4th</strong></p> +<p>LDBC plenary meeting for project partners.</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506362.ppt">Benchmarking Graph-Processing Platforms: A Vision</a> – Alexandru Iosup</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p>The meeting will be held at the Dutch national research institute for computer science and mathematics (<a href="http://www.cwi.nl">CWI</a> - Centrum voor Wiskunde en Informatica). 
It is located at <a href="http://www.amsterdamsciencepark.nl/">Amsterdam Science Park</a>:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505821.jpg" alt=""></p> +<p>(<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505820.pdf">A5 map</a>)</p> +<h6 id="travel">Travel</h6> +<p><strong>Arriving &amp; departing:</strong></p> +<p>Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, <a href="http://www.schiphol.com/">www.schiphol.nl</a>) that serves all main European carriers and also very many low-fare carriers.</p> +<p><a href="http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane">http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane</a></p> +<p><strong>Trains</strong> (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) &ndash; which station you are also likely arriving at in case of an international train trip.</p> +<p>From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 &ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).</p> +<p><strong>Taxi</strong> is an alternative, though expensive. 
The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).</p> +<p><strong>Public transportation</strong> (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.</p> +<p><strong>Only the &ldquo;disposable&rdquo; cards are interesting for you as visitor.</strong></p> +<p>Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.</p> +<p><strong>Getting Around:</strong> the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.</p> +<p><strong>Cars</strong></p> +<p>In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the &ldquo;WCW&rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.</p> +<p><strong>Arriving at CWI:</strong> Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. 
Then, you&rsquo;ll receive a visitor&rsquo;s pass that allows you to enter our building.</p> +<p><strong>Social Dinner</strong></p> +<p>The social dinner will take place at 7pm on April 3 in Restaurant Boom (<a href="http://www.boometenendrinken.nl/">boometenendrinken.nl</a>), Linnaeusstraat 63, Amsterdam.</p> + + + + + Third TUC Meeting + https://ldbcouncil.org/event/third-tuc-meeting/ + Tue, 19 Nov 2013 08:00:00 +0000 + + https://ldbcouncil.org/event/third-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!</p> +<p>This will be a one day event in London on the <strong>19 November 2013</strong> running in collaboration with the <a href="http://www.graphconnect.com/london/">GraphConnect</a> event (18/19 November). Registered TUC participants that would like a free pass to all of GraphConnect should register for GraphConnect using the following coupon code: <strong>LDBCTUC</strong>.</p> +<p>The TUC event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology</li> +<li>Industry discussions on the contents of the benchmarks</li> +</ul> +<p>We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.</p> +<p>We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a></li> +<li><a href="#ldbctuc-background">LDBC/TUC Background</a> +<ul> +<li><a href="#social-network-benchmark">Social Network Benchmark</a></li> +<li><a href="#semantic-publishing-benchmark">Semantic Publishing Benchmark</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>November 19th - Public TUC Meeting</strong></p> +<p>8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)</p> +<p>short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)</p> +<p>NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.</p> +<p>10:00 TUC Meeting Opening (Peter Boncz)</p> +<p>10:10 TUC Presentations (RDF Application Descriptions)</p> +<ul> +<li>Johan Hjerling (BBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275669.pdf">BBC Linked Data and the Semantic Publishing Benchmark</a></strong></em></li> +<li>Andreas Both (Unister): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505027.pdf">Ontology-driven applications in an e-commerce context</a></strong></em></li> +<li>Nuno Carvalho (Fujitsu Laboratories Europe): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275666.pdf"><em><strong>Fujitsu RDF use cases and benchmarking requirements</strong></em></a></li> +<li>Robina Clayphan (Europeana): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816977.ppt">Europeana and Open Data</a></strong></em></li> +</ul> +<p>11:30 Semantic Publishing Benchmark (SPB)</p> +<ul> +<li>Venelin Kotsev (Ontotext - LDBC): <em><strong><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">Semantic Publishing Benchmark Task Force Update</a></strong></em> and <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">report</a></strong></em></li> +</ul> +<p>12:00-13:00 Lunch at the Graph Connect venue</p> +<p><em>Talks During Lunch:</em></p> +<ul> +<li>Pedro Furtado, Jorge Bernardino (Univ. Coimbra): <strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275671.pdf">KEYSTONE Cost Action</a></strong></li> +</ul> +<p>13:00 TUC Presentations (Graph Application Descriptions)</p> +<ul> +<li>Minqi Zhou / Weining Qian (East China Normal University): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275670.pdf">Elastic and realistic social media data generation</a></strong></em></li> +<li>Andrew Sherlock (Shapespace): <em><strong>Shapespace Use Case</strong></em></li> +<li>Sebastian Verheughe (Telenor): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275667.pdf">Real-time Resource Authorization</a></strong></em></li> +</ul> +<p>14:00 Social Network Benchmark (SNB)</p> +<ul> +<li>Norbert Martinez (UPC - LDBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505025.pdf">Social Network Benchmark Task Force Update</a></strong></em> and <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">Report</a></li> +</ul> +<p><em>14:30 Break</em></p> +<p>14:45 TUC Presentations (Graph Analytics)</p> +<ul> +<li>Keith Houck (IBM): <em><strong>Benchmarking experiences with [System G Native Store (tentative title)]</strong></em></li> +<li>Abraham Bernstein (University of 
Zurich): <em><strong>Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store</strong></em></li> +<li>Luis Ceze (University of Washington): <em><strong>Grappa and GraphBench Status Update</strong></em></li> +</ul> +<p><em>15:45 Break</em></p> +<p>16:00 TUC Presentations* (Possible Future RDF Benchmarking Topics)*</p> +<ul> +<li>Christian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): <em><strong>CIDOC-CRM</strong></em></li> +<li>Atanas Kiryakov (Ontotext): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275672.pdf">Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM)</a></strong></em></li> +<li>Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275668.pdf">Geographica: A Benchmark for Geospatial RDF Stores</a></strong></em></li> +<li>Xavier Lopez (Oracle): <em><strong>W3C Property Graph progress</strong></em></li> +<li>Thomas Scharrenbach (University Zurich) <em><strong>PCKS: Benchmarking Semantic Flow Processing Systems</strong></em></li> +</ul> +<p>17:20 Meeting Conclusion (Josep Larriba Pey)</p> +<p>17:30 End of TUC meeting</p> +<p>19:00 Social dinner</p> +<p><strong>November 20th - Internal LDBC Meeting</strong></p> +<p>10:00 Start</p> +<p>12:30 <em>End of meeting</em></p> +<ul> +<li>coffee and lunch provided</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p><strong>Date</strong></p> +<p>19th November 2013</p> +<p><strong>Location</strong></p> +<p>The TUC meeting will be held in <strong>The Tower</strong> hotel (<a href="http://goo.gl/qZt8Fz">Google Maps link</a>) approximately 4 minutes walk from the <a href="http://www.graphconnect.com/london/">GraphConnect</a> conference in London.</p> +<p>Getting there</p> +<ul> +<li>From City Airport is the easiest: short ride 
on the DLR to Tower Gateway. Easy.</li> +<li>From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554995.pdf">See attached</a>.</li> +</ul> +<h3 id="ldbctuc-background">LDBC/TUC Background</h3> +<p>Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">LDBC_SNB_Report_Nov2013.pdf</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SPB_Report_Nov2013.pdf</a></li> +</ul> +<p>A summary of these efforts can be read below or, for a more detailed account, please refer to: <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554967.pdf">The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort</a>. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.</p> +<h4 id="social-network-benchmark">Social Network Benchmark</h4> +<p>The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. 
The social network scenario was chosen with the following goals in mind:</p> +<ul> +<li>it should be understandable, and the relevance of managing such data should be understandable</li> +<li>it should cover the complete range of interesting challenges, according to the benchmark scope</li> +<li>the queries should be realistic, i.e., similar data and workloads are encountered in practice</li> +</ul> +<p>SNB includes a data generator for creation of synthetic social network data with the following characteristics:</p> +<ul> +<li>data schema is representative of real social networks</li> +<li>data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions</li> +<li>the software generator is easy-to-use, configurable and scalable</li> +</ul> +<p>SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:</p> +<ul> +<li><strong>Interactive</strong> +<ul> +<li>Tests system throughput with relatively simple queries and concurrent updates, it is designed to test ACID features and scalability in an online operational setting.</li> +<li>The targeted systems are expected to be those that offer transactional functionality.</li> +</ul> +</li> +<li><strong>Business Intelligence</strong> +<ul> +<li>Consists of complex structured queries for analyzing online behavior of users for marketing purposes, it is designed to stress query execution and optimization.</li> +<li>The targeted systems are expected to be those that offer an abstract query language.</li> +</ul> +</li> +<li><strong>Graph Analytics</strong> +<ul> +<li>Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.</li> +<li>Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need 
isolation.</li> +<li>The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.</li> +</ul> +</li> +</ul> +<h4 id="semantic-publishing-benchmark">Semantic Publishing Benchmark</h4> +<p>The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.</p> +<p>The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works &ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.</p> +<p>The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. 
Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.</p> +<p>Two separate workloads are modeled in SPB:</p> +<ul> +<li><strong>Editorial:</strong> Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.</li> +<li><strong>Aggregation:</strong> Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as &ldquo;dynamic&rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.</li> +</ul> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505026.pdf">Status of the Semantic Publishing Benchmark</a></p> + + + + + Second TUC Meeting + https://ldbcouncil.org/event/second-tuc-meeting/ + Mon, 22 Apr 2013 10:00:00 +0000 + + https://ldbcouncil.org/event/second-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.</p> +<p>This will be a two day event in Munich on the <strong>22/23rd April 2013</strong>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +<li><a href="#venue">Venue</a> +<ul> +<li><a href="#getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</a></li> +<li><a href="#getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</a></li> +<li><a href="#getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</a></li> +</ul> +</li> +<li><a href="#getting-there">Getting there</a></li> +<li><a href="#social-dinner">Social Dinner</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>April 22nd</strong></p> +<p>10:00 <em>Registration.</em><br> +10:30 Josep Lluis Larriba Pey (UPC) - <em>Welcome and Introduction.</em><br> +10:30 Peter Boncz (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687373.pptx">LDBC: goals and status</a></p> +<p><em>Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)</em></p> +<p>11:00 Josep Lluis Larriba Pey (UPC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687372.pdf">Social Network Benchmark Task Force</a><br> +11:30 Gustavo González (Mediapro): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687367.pdf">Graph-based User Modeling through Real-time Social Streams</a><br> +12:00 Klaus Großmann (Dshini): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687365.pdf">Neo4j at Dshini</a></p> +<p>12:30 Lunch</p> +<p><em>Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)</em></p> 
+<p>13:30 Barry Bishop (Ontotext): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687366.pptx">Semantic Publishing Benchmark Task Force</a><br> +14:00 Dave Rogers (BBC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687364.pptx">Linked Data Platform at the BBC</a><br> +14:30 Edward Thomas (Wolters Kluwer): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687374.pdf">Semantic Publishing at Wolters Kluwer</a></p> +<p>15:00 Coffee break</p> +<p><em>Projects Related to LDBC</em></p> +<p>15:30 Fabian Suchanek (MPI): &ldquo;YAGO: A large knowledge base from Wikipedia and WordNet&rdquo;<br> +16:00 Antonis Loziou (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687375.pptx">The OpenPHACTS approach to data integration</a><br> +16:30 Mirko Kämpf (Brox): &ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case&rdquo;</p> +<p>17:00 <em>End of first day</em></p> +<p>19:00 Social dinner</p> +<p><strong>April 23rd</strong></p> +<p><em>Industry &amp; Hardware Aspects</em></p> +<p>10:00 Xavier Lopez (Oracle): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687384.pdf">Graph Database Performance an Oracle Perspective.pdf</a><br> +10:30 Pedro Trancoso (University of Cyprus): &ldquo;Benchmarking and computer architecture: the research side&rdquo;</p> +<p>11:00 Coffee break</p> +<p><em>Future Steps and TUC feedback session</em></p> +<p>11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force<br> +12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force</p> +<p>12:30 <em>End of meeting</em></p> +<h3 id="logistics">Logistics</h3> +<h4 id="date">Date</h4> +<p>22nd and 23rd April 2013</p> +<h4 
id="location">Location</h4> +<p>The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:</p> +<p>LRZ (Leibniz-Rechenzentrum)<br> +Boltzmannstraße 1<br> +85748 Garching, Germany</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi and Subway <a href="http://www.in.tum.de/fileadmin/user_upload/Sonstiges/anfahrt_garching.pdf">Ubahn</a></p> +<h5 id="getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</h5> +<p>Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.</p> +<h5 id="getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</h5> +<ol> +<li> +<p>(except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.</p> +</li> +<li> +<p>S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.</p> +</li> +<li> +<p>Taxi: fare is ca. 30-40 euros.</p> +</li> +</ol> +<p>For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. 
It will cover all public transportation for that day.</p> +<h5 id="getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</h5> +<p>The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.</p> +<p><strong>Finding LRZ@TUM</strong></p> +<p><a href="http://www.openstreetmap.org/?mlat=48.2615702464&amp;mlon=11.6686558264&amp;zoom=32">OpenStreetMap link</a></p> +<p><a href="https://maps.google.com/maps?q=48.2615702464,11.6686558264&amp;spn=0.005,0.005&amp;t=k">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687268.gif" alt=""></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687269.gif" alt=""></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying: Munich</strong> airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.</p> +<p><strong>S-Bahn:</strong> S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the ticket needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.</p> +<p><strong>Taxi:</strong> taxi from the airport to the city center costs approximately 50 euros</p> +<h4 id="social-dinner">Social Dinner</h4> +<p>The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)</p> +<p>Address: Hofbräuhaus, Platzl 9, Munich</p> + + + + + First TUC Meeting + https://ldbcouncil.org/event/first-tuc-meeting/ + Mon, 19 Nov 2012 09:00:00 +0100 + + https://ldbcouncil.org/event/first-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the <strong>19/20th November 2012</strong>.</p> +<p>So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event will include:</p> +<ul> +<li>Introduction by the coordinator and technical director explaining the objectives of the LDBC project</li> +<li>Invitation to users to explain their use-cases and describe the limitations they have found in current technology</li> +<li>Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points</li> +</ul> +<p>The exact agenda will be published here as things get finalised before the event.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#slide">Slide</a> +<ul> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +</ul> +</li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>We will start at 9:00 on Monday for a full day, followed by half a day on Tuesday to allow attendees to travel home on the evening of the 20th.</p> +<p><strong>Day 1</strong></p> +<p>09:00 Welcome (Location: Aula Master)<br> +09:30 Project overview (Emphasis on task forces?) + Questionnaire results?<br> +10:30 Coffee break<br> +11:00 User talks (To gather information for use cases?)</p> +<p>13:00 Lunch</p> +<p>14:00 User talks (cont.)<br> +15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).<br> +16:00 Task force proposals (consortium)<br> +17:00 Finish first day</p> +<p>20:00 Social dinner</p> +<p><strong>Day 2</strong></p> +<p>10:00 Task force discussion (consortium + TUC)<br> +11:00 Coffee break<br> +11:30 Task force discussion (consortium + TUC)<br> +12:30 Summaries (Task forces, use cases, &hellip;) and actions</p> +<p>13:00 Lunch and farewell</p> +<p>15:00 LDBC Internal meeting</p> +<h3 id="slide">Slide</h3> +<p>Opening session:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686995.pptx">CWI – Peter Boncz</a> – Objectives</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687001.pdf">UPC – Larri</a> – Questionnaire</li> +</ul> +<p>User stories:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686998.pdf">BBC – Jem Rayfield</a></li> +<li>CA Technologies – Victor Muntés</li> 
+<li>Connected Discovery (Open Phacts) – Bryn Williams-Jones</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687003.pptx">Elsevier – Alan Yagoda</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687000.pptx">ERA7 Bioinformatics – Eduardo Pareja</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687005.pptx">Press Association – Jarred McGinnis</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687004.pptx">RJLee – David Neuer</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686994.pdf">Yale – Lec Maj</a></li> +</ul> +<p>Benchmark proposals:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686991.pdf">Publishing benchmark proposal – Ontotext – Barry Bishop</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687002.pdf">Social Network Benchmark Proposal – UPC – Larri</a></li> +</ul> +<h4 id="logistics">Logistics</h4> +<h5 id="date">Date</h5> +<p>19th and 20th November 2012</p> +<h5 id="location">Location</h5> +<p>The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<p><strong>Finding UPC</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<p><strong>Finding the meeting room</strong></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<p><strong>The locations of the airport and the city centre</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933317.jpg" alt=""></p> +<p><strong>Bus map</strong></p> + + + + + \ No newline at end of file diff --git a/industry/members/index.html b/industry/members/index.html new file mode 100644 index 00000000..6e861009 --- /dev/null +++ b/industry/members/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/organizational-members/ + + + + + + diff --git a/introduction/index.html b/introduction/index.html new file mode 100644 index 00000000..42c472bf --- /dev/null +++ b/introduction/index.html @@ -0,0 +1,355 @@ + + + + + Introduction + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Introduction

+ + + + +
+
+
+
+ + + + + +
+
+
+
+
+

The Linked Data Benchmark Council (LDBC) is a non-profit organization aiming to define standard graph benchmarks to foster a community around graph processing technologies. LDBC consists of members from both industry and academia, including organizations and individuals.

+

An overview of our activities is summarized in a lightning talk at FOSDEM 2023’s HPC room (9 minutes):

+ +

See also our TPCTC 2023 paper and its slide deck.

+

Contact

+

To learn more about LDBC, reach out at info@ldbcouncil.org.

+

Post address

+

First Floor, Two Chamberlain Square
+Birmingham
+B3 3AX
+United Kingdom

+ +
+
+ +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/index.html b/jan-2-2006/index.html new file mode 100644 index 00000000..5e3e0e14 --- /dev/null +++ b/jan-2-2006/index.html @@ -0,0 +1,765 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+ +
+ +

We are delighted to announce the official release of the initial version (v0.1.0) of Financial Benchmark (FinBench).

+

The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the LDBC FinBench Task Force. The benchmark has one workload currently, Transaction Workload, capturing an OLTP scenario with complex read queries that access the …

+ +
+
+ +
+ + +
+
+
+ +

Posts

+
Tags:
+ +
+
+ + + +
+
+ +
+ + +
+
+
+ +

LDBC SNB – Early 2023 updates

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

2023 has been an eventful year for us so far. Here is a summary of our recent activities.

+
    +
  1. +

    Our paper The LDBC Social Network Benchmark: Business Intelligence Workload was published in PVLDB.

    +
  2. +
  3. +

    David Püroja just completed his MSc thesis on creating a design towards SNB Interactive v2 at CWI’s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference’s graph developer room titled The LDBC Social Network …

+ +
+
+ +
+ + +
+
+
+ +

LDBC SNB Datagen – The winding path to SF100K

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my last technical update on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the …

+ +
+
+ +
+ + +
+
+ +
+ +

We are delighted to announce the set up of the Financial Benchmark (FinBench) task force.

+

The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. The FinBench is scheduled to be released in the …

+ +
+
+ +
+ + +
+
+
+ +

Speeding Up LDBC SNB Datagen

+
Tags:
+ DATAGEN + , SNB + +
+
+ +

LDBC’s Social Network Benchmark [4] (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems’ bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/index.xml b/jan-2-2006/index.xml new file mode 100644 index 00000000..fae2e743 --- /dev/null +++ b/jan-2-2006/index.xml @@ -0,0 +1,5297 @@ + + + + Jan 2, 2006 on Linked Data Benchmark Council + https://ldbcouncil.org/jan-2-2006/ + Recent content in Jan 2, 2006 on Linked Data Benchmark Council + Hugo -- gohugo.io + en-us + &copy; Copyright LDBC 2024 + + Announcing the Official Release of LDBC Financial Benchmark v0.1.0 + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + Tue, 27 Jun 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-official-release-of-ldbc-financial-benchmark/ + <p>We are delighted to announce the official release of the initial version (v0.1.0) of <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench)</a>.</p> +<p>The Financial Benchmark (FinBench) project defines a graph database benchmark targeting financial scenarios such as anti-fraud and risk control. It is maintained by the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">LDBC FinBench Task Force</a>. The benchmark has one workload currently, <strong>Transaction Workload</strong>, capturing OLTP scenario with complex read queries that access the neighbourhood of a given node in the graph and write queries that continuously insert or delete data in the graph.</p> +<p>Compared to LDBC SNB, the FinBench differs in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. For a brief overview, see the <a href="https://ldbcouncil.org/benchmarks/finbench/finbench-talk-16th-tuc.pdf">slides</a> in the 16th TUC. 
The <a href="https://arxiv.org/pdf/2306.15975.pdf">Financial Benchmark&rsquo;s specification</a> can be found on arXiv.</p> +<p>The release of FinBench initial version (v0.1.0) was approved by LDBC on June 23, 2023. It is a good beginning for FinBench. In the future, the FinBench Task Force will polish the benchmark continuously.</p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or qishipeng.qsp at antgroup.com.</p> + + + + + Sixteenth TUC Meeting + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + Fri, 23 Jun 2023 09:00:00 -0800 + + https://ldbcouncil.org/event/sixteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Oskar van Rest, Alastair Green, Gábor Szárnyas</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2023.sigmod.org/venue.shtml">SIGMOD 2023</a> on <strong>June 23-24 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10- and 15-minute talks followed by a Q&amp;A session. The talks will be recorded and made available online. 
<strong>If you would like to participate please register using <a href="https://forms.gle/T6bwVHzK9V5FaKyR9">our form</a>.</strong></p> +<p>LDBC will host a <strong>social event</strong> on Friday at the <a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a>.</p> +<p>In addition, AWS will host a <strong>Happy Hour</strong> (rooftop grill with beverages) on Saturday on the Amazon Nitro South building&rsquo;s 8th floor deck: <a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>.</p> +<h3 id="program">Program</h3> +<p><strong>All times are in PDT.</strong></p> +<h4 id="friday">Friday</h4> +<p><strong>Location:</strong> Hyatt Regency Bellevue on Seattle&rsquo;s Eastside, <strong>room Grand K</strong>, co-located with SIGMOD (<a href="https://www.hyatt.com/en-US/hotel/washington/hyatt-regency-bellevue-on-seattles-eastside/belle">900 Bellevue Way NE, Bellevue, WA 98004-4272</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>08:30</td> +<td>08:45</td> +<td>Oskar van Rest (Oracle)</td> +<td>LDBC – State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-ldbc-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/Frk7ITssaSY">video</a></td> +</tr> +<tr> +<td>08:50</td> +<td>09:05</td> +<td>Keith Hare (JCC / WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/keith-hare-an-update-on-the-gql-and-sql-pgq-standards-efforts.pdf">slides</a>, <a href="https://youtu.be/LQYkal_0j6E">video</a></td> +</tr> +<tr> +<td>09:10</td> +<td>09:25</td> +<td>Stefan Plantikow (Neo4j / WG3)</td> +<td>GQL - Introduction to a new query 
language standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/stefan-plantikow-gql-v1.pdf">slides</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Leonid Libkin (University of Edinburgh &amp; RelationalAI)</td> +<td>Formalizing GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/leonid-libkin-formalizing-gql.pdf">slides</a>, <a href="https://youtu.be/YZE1a00h1I4">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Semen Panenkov (JetBrains Research)</td> +<td>Mechanizing the GQL semantics in Coq – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/semyon-panenkov-gql-in-coq.pdf">slides</a>, <a href="https://youtu.be/5xBGohqWCzo">videos</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Oskar van Rest (Oracle)</td> +<td>SQL Property Graphs in Oracle Database and Oracle Graph Server (PGX) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/oskar-van-rest-sql-property-graphs-in-oracle-database-and-oracle-graph-server-pgx.pdf">slides</a>, <a href="https://youtu.be/owM9WiQubpg">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (JCC)</td> +<td>LDBC&rsquo;s organizational changes and fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-ldbc-corporate-restructuring-and-fair-use-policies.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>Ioana Manolescu (INRIA)</td> +<td>Integrating Connection Search in Graph Queries – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ioana-manolescu-integrating-connection-search-in-graph-queries.pdf">slides</a>, <a 
href="https://youtu.be/LQPnmcrkUpY">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Maciej Besta (ETH Zurich)</td> +<td>Neural Graph Databases with Graph Neural Networks – <a href="https://youtu.be/ce5qNievRNs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:10</td> +<td>Longbin Lai (Alibaba Damo Academy)</td> +<td>To Revisit Benchmarking Graph Analytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/longbin-lai-benchmark-ldbc.pdf">slides</a>, <a href="https://youtu.be/s9Vtt-6t_FI">video</a></td> +</tr> +<tr> +<td>12:15</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>The World of Graph Databases from An Industry Perspective – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/yuanyuan-tian-world-of-graph-databases.pdf">slides</a>, <a href="https://youtu.be/AZuP_b95GPM">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Alin Deutsch (UC San Diego &amp; TigerGraph)</td> +<td>TigerGraph&rsquo;s Parallel Computation Model – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alin-deutsch-tigergraphs-computation-model.pdf">slides</a>, <a href="https://youtu.be/vcxdieJB80Y">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Chen Zhang (CreateLink)</td> +<td>Applications of a Native Distributed Graph Database in the Financial Industry – <a href="https://youtu.be/GCCT79Sps9I">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Ricky Sun (Ultipa)</td> +<td>Design of highly scalable graph database systems – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ricky-sun-ultipa.pdf">slides</a>, <a href="https://youtu.be/Sg1F64O4vGM">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> 
+<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Heng Lin (Ant Group)</td> +<td>The LDBC SNB implementation in TuGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-the-ldbc-snb-implementation-in-tugraph.pdf">slides</a>, <a href="https://youtu.be/fy8AuVerwnY">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>Shipeng Qi (Ant Group)</td> +<td>FinBench: The new LDBC benchmark targeting financial scenario – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/shipeng-qi-finbench.pdf">slides</a>, <a href="https://youtu.be/0xLZadDOfZk">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>17:00</td> +<td>host: Heng Lin (Ant Group), panelists: Longbin Lai (Alibaba Damo Academy), Ricky Sun (Ultipa), Gabor Szarnyas (CWI), Yuanyuan Tian (Gray Systems Lab, Microsoft)</td> +<td>FinBench panel – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/heng-lin-finbench-panel.pdf">slides</a></td> +</tr> +<tr> +<td>19:00</td> +<td>22:00</td> +<td><em>dinner</em></td> +<td><em><a href="https://www.blackbottleseattle.com/">Black Bottle gastrotavern</a> in Belltown: <a href="https://goo.gl/maps/hQzBRR2nerZEQExw7">2600 1st Ave (on the corner of Vine), Seattle, WA 98121</a></em></td> +</tr> +</tbody> +</table> +<h4 id="saturday">Saturday</h4> +<p><strong>Location:</strong> Amazon Nitro South building, <strong>room 03.204</strong> (<a href="https://goo.gl/maps/md5kWUHaNUGhR9JB7">2205 8th Ave, Seattle, WA 98121</a>)</p> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:00</td> +<td>09:45</td> +<td>Brad Bebee (AWS)</td> +<td>Customers don&rsquo;t want a graph database, so why are we still here? 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/brad-bebee-tuc-keynote.pdf">slides</a>, <a href="https://youtu.be/bJlkpDC--fM">video</a></td> +</tr> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Fast and Efficient Update Handling for Graph H2TAP – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/muhammad-attahir-jibril-fast-and-efficient-update-handling-for-graph-h2tap.pdf">slides</a>, <a href="https://youtu.be/e8ZAszBsXV0">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Gabor Szarnyas (CWI)</td> +<td>LDBC Social Network Benchmark and Graphalytics – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-social-network-benchmark-and-graphalytics.pdf">slides</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:30</td> +<td>Atanas Kiryakov and Tomas Kovachev (Ontotext)</td> +<td>GraphDB – Benchmarking against LDBC SNB &amp; SPB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tomas-kovatchev-atanas-kiryakov-benchmarking-graphdb-with-snb-and-spb.pdf">slides</a>, <a href="https://youtu.be/U6OPpNFOWqg">video</a></td> +</tr> +<tr> +<td>11:35</td> +<td>11:50</td> +<td>Roi Lipman (Redis Labs)</td> +<td>Delta sparse matrices within RedisGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/roi-lipman-delta-matrix.pdf">slides</a>, <a href="https://youtu.be/qfKsplV4Ihk">video</a></td> +</tr> +<tr> +<td>11:55</td> +<td>12:05</td> +<td>Rathijit Sen (Microsoft)</td> +<td>Microarchitectural Analysis of Graph BI Queries on RDBMS – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/rathijit-sen-microarchitectural-analysis.pdf">slides</a>, <a href="https://youtu.be/55B8CkH09js">video</a></td> +</tr> +<tr> +<td>12:10</td> +<td>13:30</td> +<td><em>lunch</em></td> +<td><em>on your own</em></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Alastair Green (JCC)</td> +<td>LEX &ndash; LDBC Extended GQL Schema – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/alastair-green-lex.pdf">slides</a>, <a href="https://youtu.be/DVpeb4Ce9Uw">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Ora Lassila (AWS)</td> +<td>Why limit yourself to {RDF, LPG} when you can do {RDF, LPG}, too – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/ora-lassila-why-limit-yourself-to-lpg-when-you-can-do-rdf-too.pdf">slides</a>, <a href="https://youtu.be/7uAInoUwdds">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Jan Hidders (Birkbeck, University of London)</td> +<td>PG-Schema: a proposal for a schema language for property graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/jan-hidders-pg-schema.pdf">slides</a>, <a href="https://youtu.be/yQNL8hBTE4M">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Max de Marzi (RageDB and RelationalAI)</td> +<td>RageDB: Building a Graph Database in Anger – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/max-de-marzi-ragedb-building-a-graph-database-in-anger.pdf">slides</a>, <a href="https://youtu.be/LBbF8aslYFE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:30</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>15:30</td> +<td>15:45</td> +<td>Umit Catalyurek (AWS)</td> +<td>HPC Graph Analytics on the OneGraph Model – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/umit-catalyurek-onegraph-hpc.pdf">slides</a>, <a href="https://youtu.be/64tv5LA6Wr8">video</a></td> +</tr> +<tr> +<td>15:50</td> +<td>16:05</td> +<td>David J. Haglin (Trovares)</td> +<td>How LDBC impacts Trovares – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/david-haglin-trovares.pdf">slides</a>, <a href="">video</a></td> +</tr> +<tr> +<td>16:10</td> +<td>16:25</td> +<td>Wenyuan Yu (Alibaba Damo Academy)</td> +<td>GraphScope Flex: A Graph Computing Stack with LEGO-Like Modularity – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/wenyuan-yu-graphscope-flex.pdf">slides</a>, <a href="https://youtu.be/cRikoyDmMks">video</a></td> +</tr> +<tr> +<td>16:30</td> +<td>16:40</td> +<td>Scott McMillan (Carnegie Mellon University)</td> +<td>Graph processing using GraphBLAS – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/scott-mcmillan-graph-processing-using-graphblas.pdf">slides</a>, <a href="https://youtu.be/yb4hGBhUzQQ">video</a></td> +</tr> +<tr> +<td>16:45</td> +<td>16:55</td> +<td>Tim Mattson (Intel)</td> +<td>Graphs (GraphBLAS) and storage (TileDB) as Sparse Linear algebra – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixteenth-tuc-meeting/attachments/tim-mattson-graphblas-and-tiledb.pdf">slides</a></td> +</tr> +<tr> +<td>17:00</td> +<td>20:00</td> +<td><em>happy hour (rooftop grill with beverages)</em></td> +<td><em>on the Nitro South building&rsquo;s 8th floor deck</em></td> +</tr> +</tbody> +</table> +<h4 id="tuc-event-locations">TUC event locations</h4> +<p>A <a href="https://www.google.com/maps/d/u/0/edit?mid=19_fi4fV-3-PZkNWCCcmhU86ct2EZXbgo">map of the LDBC TUC events</a> we hosted so far.</p> + + + + + LDBC SNB – Early 2023 updates + 
https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + Wed, 15 Feb 2023 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-early-2023-updates/ + <p>2023 has been an eventful year for us so far. Here is a summary of our recent activities.</p> +<ol> +<li> +<p>Our paper <a href="https://ldbcouncil.org/docs/papers/ldbc-snb-bi-vldb-2022.pdf">The LDBC Social Network Benchmark: Business Intelligence Workload</a> was published in PVLDB.</p> +</li> +<li> +<p>David Püroja just completed his MSc thesis on creating a design towards <a href="https://ldbcouncil.org/docs/papers/msc-thesis-david-puroja-snb-interactive-v2-2023.pdf">SNB Interactive v2</a> at CWI&rsquo;s Database Architectures group. David and I gave a deep-dive talk at the FOSDEM conference&rsquo;s graph developer room titled <a href="https://fosdem.org/2023/schedule/event/graph_ldbc/">The LDBC Social Network Benchmark</a> (<a href="https://www.youtube.com/watch?v=YNF6z6gtXY4">YouTube mirror</a>).</p> +</li> +<li> +<p>I gave a lightning talk at FOSDEM&rsquo;s HPC developer room titled <a href="https://www.youtube.com/watch?v=q26DHnQFw54">The LDBC Benchmark Suite</a> (<a href="https://www.youtube.com/watch?v=q26DHnQFw54">YouTube mirror</a>).</p> +</li> +<li> +<p>Our auditors have successfully benchmarked a number of systems:</p> +<ul> +<li>SPB with the Ontotext GraphDB systems for the SF3 and SF5 data sets (auditor: Pjotr Scholtze)</li> +<li>SNB Interactive with the Ontotext GraphDB system for the SF30 data set (auditor: David Püroja)</li> +<li>SNB Interactive with the TuGraph system running in the Aliyun cloud for the SF30, SF100, and SF300 data sets (auditor: Márton Búr)</li> +</ul> +</li> +</ol> +<p>The results and the full disclosure reports are available under the <a href="https://ldbcouncil.org/benchmarks/spb/">SPB</a> and <a href="https://ldbcouncil.org/benchmarks/snb/">SNB benchmark pages</a>.</p> + + + + + LDBC SNB Datagen – The winding path to SF100K + 
https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + Tue, 13 Sep 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-snb-datagen-the-winding-path-to-sf100k/ + <p>LDBC SNB provides a data generator, which produces synthetic datasets, mimicking a social network’s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. More than two years have elapsed since my <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">last technical update</a> on LDBC SNB Datagen, in which I discussed the reasons for moving the code to Apache Spark from the MapReduce-based Apache Hadoop implementation and the challenges I faced during the migration. Since then, we reached several goals: we refactored the serializers to use Spark&rsquo;s high-level writers to support the popular Parquet data format and to enable running on spot nodes; brought back factor generation; implemented support for the novel BI benchmark; and optimized the runtime to generate SF30K on 20 i3.4xlarge machines on AWS.</p> +<h1 id="moving-to-sparksql">Moving to SparkSQL</h1> +<p>We planned to move parts of the code to SparkSQL, an optimized runtime framework for tabular data. We hypothesized that this would benefit us on multiple fronts: SparkSQL offers an efficient batch analytics runtime, with higher level abstractions that are simpler to understand and work with, and we could easily add support for serializing to Parquet based on SparkSQL&rsquo;s capabilities.</p> +<blockquote> +<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine. 
Spark SQL includes a cost-based optimizer, columnar storage, and code generation to make queries fast.</p> +</blockquote> +<p>Dealing with the dataset generator proved quite tricky, because it samples from various hand-written distributions and dictionaries, and contains complex domain logic, for which SparkSQL is unsuitable. We assessed that the best thing we could do is wrap entire entity generation procedures in UDFs (user defined SQL functions). However, several of these generators return entity trees<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup>, which are spread across multiple tables by the serializer, and these would have needed to be split up. Further complicating matters, we would have also had to find a way to coordinate the inner random generators&rsquo; state between the UDFs to ensure deterministic execution. Weighing these and that we could not find much benefit in SparkSQL, we ultimately decided to leave entity generation as it is. We limited the SparkSQL refactor to the following areas:</p> +<ol> +<li>table manipulations related to shaping the output into the supported layouts and data types as set forth in the specification;</li> +<li>deriving the Interactive and BI datasets;</li> +<li>and generating the factor tables, which contain analytic information, such as population per country, number of friendships between city pairs, number of messages per day, etc., used by the substitution parameter generator to ensure predictable query runtimes.</li> +</ol> +<p>We refer to points (1.) and (2.) collectively as dataset transformation, while (3.) as factor generation. Initially, these had been part of the generator, extracted as part of this refactor, which resulted in a cleaner, more maintainable design.</p> +<p><img src="datagen_df_0.png" alt="Datagen stages"></p> +<p>The diagram above shows the components on a high level. 
The generator outputs a dataset called IR (intermediate representation), which is immediately written to disk. Then, the IR is input to the dataset transformation and factor generation stages, which respectively generate the final dataset and the factor tables. We are aware that spitting out the IR adds considerable runtime overhead and doubles the disk requirements in the worst-case scenario, however, we found that there&rsquo;s no simple way to avoid<br> +it, as the generator produces entity trees, which are incompatible with the flat, tabular, column oriented layout of SparkSQL. On the positive side, this design enables us to reuse the generator output for multiple transformations and add new factor tables without regenerating the data.</p> +<p>I&rsquo;ll skip describing the social network graph dataset generator (i.e. stage 1) in any more detail, apart from its serializer, as that was the only part involved in the current refactor. If you are interested in more details, you may look up the <a href="https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/">previous blogpost in the series</a> or the <a href="https://arxiv.org/abs/2001.02299">Interactive benchmark specification</a>.</p> +<h1 id="transformation-pipeline">Transformation pipeline</h1> +<p>The dataset transformation stage sets off where generation finished, and applies an array of pluggable transformations:</p> +<ul> +<li>explodes edges and / or attributes into separate tables,</li> +<li>subsets the snapshot part and creates insert / delete batches for the BI workload,</li> +<li>subsets the snapshot part for the Interactive workload,</li> +<li>applies formatting related options such as date time representation,</li> +<li>serializes the data to a Spark supported format (CSV, Parquet),</li> +</ul> +<p>We utilize a flexible data pipeline that operates on the graph.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code 
class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span>, <span style="color:#66d9ef">M2</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">]</span> <span style="color:#a6e22e">extends</span> <span style="color:#f92672">(</span><span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">])</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">In</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span 
style="color:#66d9ef">def</span> apply<span style="color:#f92672">(</span>v<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M1</span><span style="color:#f92672">])</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M2</span><span style="color:#f92672">]</span> <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>v<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>The <code>Transform</code> trait encodes a pure (side effect-free) function polymorphic over graphs, so that transformation pipelines can be expressed with ordinary function composition in a type safe manner. Let&rsquo;s see some of the transformations we have.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToBiTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">BI</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> keepImplicitDeletes<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span 
style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.BI</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>mode<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">,</span> simulationStart<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">,</span> simulationEnd<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Long</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Interactive</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span 
style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeEdges</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> 
+</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">???</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Therefore, a transformation pipeline may look like this:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">RawToInteractiveTransform</span><span style="color:#f92672">(</span>params<span style="color:#f92672">,</span> start<span style="color:#f92672">,</span> end<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>inputGraph<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>The <code>Graph</code> record has a <code>definition</code> field containing graph-global metadata, whereas <code>entities</code> holds the 
datasets keyed by their entity type. There are 3 graph <em>modes</em> currently: <code>Raw</code>, <code>Interactive</code> and <code>BI</code>. The BI dataset has a different layout than the rest, as it contains incremental inserts and deletes for the entities in addition to the bulk snapshot. This is captured in the <code>Layout</code> dependent type, over which the entities are polymorphic.</p> +<p>It&rsquo;s important to understand that <code>Graph</code> holds <code>DataFrame</code>s, and these are lazily computed by Spark. So, <code>Graph</code> is merely a description of transformations used to derive its constituent datasets, which makes them subject to all the SparkSQL fanciness such as query optimization, whole stage code generation, and so on. Processing is delayed until an action (such as a disk write) forces it.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> isAttrExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> isEdgesExploded<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> useTimestamp<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> mode<span style="color:#66d9ef">:</span> 
<span style="color:#66d9ef">M</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">Option</span><span style="color:#f92672">[</span><span style="color:#66d9ef">String</span><span style="color:#f92672">]]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">+M</span> <span style="color:#66d9ef">&lt;:</span> <span style="color:#66d9ef">Mode</span><span style="color:#f92672">](</span> +</span></span><span style="display:flex;"><span> definition<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">GraphDef</span><span style="color:#f92672">[</span><span style="color:#66d9ef">M</span><span style="color:#f92672">],</span> +</span></span><span style="display:flex;"><span> entities<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Map</span><span style="color:#f92672">[</span><span style="color:#66d9ef">EntityType</span>, <span style="color:#66d9ef">M</span><span style="color:#66d9ef">#</span><span style="color:#66d9ef">Layout</span><span style="color:#f92672">]</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">sealed</span> <span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Layout</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">object</span> <span style="color:#a6e22e">Raw</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">Interactive</span><span style="color:#f92672">(</span>bulkLoadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">DataFrame</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... 
*/</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">final</span> <span style="color:#66d9ef">case</span> <span style="color:#66d9ef">class</span> <span style="color:#a6e22e">BI</span><span style="color:#f92672">(</span>bulkloadPortion<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Double</span><span style="color:#f92672">,</span> batchPeriod<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">String</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Mode</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Layout</span> <span style="color:#f92672">=</span> <span style="color:#a6e22e">BatchedEntity</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>You may notice that <code>Transform</code> is statically typed w.r.t. <code>Mode</code>; however, other properties, like <code>isAttrExploded</code> or <code>isEdgesExploded</code>, are not captured in the type and remain merely dynamic. This makes some nonsensical transformation pipelines (e.g. one that explodes edges twice in a row) syntactically valid. This trade-off in compile-time safety was made to prevent overcomplicating the types.</p> +<p>As we already mentioned, <code>Graph</code> is essentially a persistent container of <code>EntityType -&gt; DataFrame</code> mappings. 
<code>EntityType</code> can be <code>Node</code>, <code>Edge</code> or <code>Attr</code>, and is used to identify the entity and embellish it with static metadata, such as a descriptive name and primary key, whether it is static or dynamic (as per the specification), and, in the case of edges, the source and destination type and cardinality. This makes it very simple to create transformation rules on static entity properties with pattern matching.</p> +<p>Usually, a graph transformation involves matching entities based on their <code>EntityType</code>, and modifying the mapping (and if required, other metadata). Take, for example, the <code>ExplodeAttrs</code> transformation, which explodes into separate tables the values of two columns of <code>Person</code> stored as arrays:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">object</span> <span style="color:#a6e22e">ExplodeAttrs</span> <span style="color:#66d9ef">extends</span> <span style="color:#a6e22e">Transform</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span>, <span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">override</span> <span style="color:#66d9ef">def</span> transform<span style="color:#f92672">(</span>input<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">In</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Out</span> <span style="color:#f92672">=</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">if</span> <span 
style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#75715e">// assert at runtime that the transformation hasn&#39;t been applied yet +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#66d9ef">throw</span> <span style="color:#66d9ef">new</span> <span style="color:#a6e22e">AssertionError</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Attributes already exploded in the input graph&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> explodedAttr<span style="color:#f92672">(</span>attr<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Attr</span><span style="color:#f92672">,</span> node<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">DataFrame</span><span style="color:#f92672">,</span> column<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Column</span><span style="color:#f92672">)</span> <span style="color:#66d9ef">=</span> +</span></span><span style="display:flex;"><span> attr <span style="color:#f92672">-&gt;</span> node<span style="color:#f92672">.</span>select<span style="color:#f92672">(</span>withRawColumns<span style="color:#f92672">(</span>attr<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>parent<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">),</span> 
explode<span style="color:#f92672">(</span>split<span style="color:#f92672">(</span>column<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;;&#34;</span><span style="color:#f92672">)).</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">s&#34;</span><span style="color:#e6db74">${</span>attr<span style="color:#f92672">.</span>attribute<span style="color:#e6db74">}</span><span style="color:#e6db74">Id&#34;</span><span style="color:#f92672">)))</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> modifiedEntities <span style="color:#66d9ef">=</span> input<span style="color:#f92672">.</span>entities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>collect <span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k <span style="color:#66d9ef">@</span> <span style="color:#a6e22e">Node</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person&#34;</span><span style="color:#f92672">,</span> <span style="color:#66d9ef">false</span><span style="color:#f92672">),</span> df<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> <span style="color:#75715e">// match the Person node. 
This is the only one ExplodeAttrs should modify +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#a6e22e">Map</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Email&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;EmailAddress&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonEmailEmailAddress&#34; entity derived by exploding the email column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> explodedAttr<span style="color:#f92672">(</span><span style="color:#a6e22e">Attr</span><span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Speaks&#34;</span><span style="color:#f92672">,</span> k<span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;Language&#34;</span><span style="color:#f92672">),</span> df<span style="color:#f92672">,</span> $<span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">),</span> <span style="color:#75715e">// add a new &#34;PersonSpeaksLanguage&#34; entity derived by exploding the language column of Person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> k <span style="color:#f92672">-&gt;</span> df<span style="color:#f92672">.</span>drop<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;email&#34;</span><span style="color:#f92672">,</span> <span style="color:#e6db74">&#34;language&#34;</span><span style="color:#f92672">)</span> <span style="color:#75715e">// drop the exploded 
columns from person +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntities <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>entities<span style="color:#f92672">)(</span><span style="color:#66d9ef">_</span> <span style="color:#f92672">++</span> <span style="color:#66d9ef">_</span><span style="color:#f92672">)</span> <span style="color:#75715e">// merge-replace the modified entities in the graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> updatedEntityDefinitions <span style="color:#66d9ef">=</span> modifiedEntities +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>foldLeft<span style="color:#f92672">(</span>input<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities<span style="color:#f92672">)</span> <span style="color:#f92672">{</span> <span style="color:#f92672">(</span>e<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> e <span style="color:#f92672">++</span> v<span style="color:#f92672">.</span>map<span style="color:#f92672">{</span> <span style="color:#66d9ef">case</span> <span style="color:#f92672">(</span>k<span style="color:#f92672">,</span> v<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> k <span 
style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Some</span><span style="color:#f92672">(</span>v<span style="color:#f92672">.</span>schema<span style="color:#f92672">.</span>toDDL<span style="color:#f92672">)</span> <span style="color:#f92672">}</span> <span style="color:#75715e">// update the entity definition schema to reflect the modifications +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> l <span style="color:#66d9ef">=</span> lens<span style="color:#f92672">[</span><span style="color:#66d9ef">In</span><span style="color:#f92672">]</span> <span style="color:#75715e">// lenses provide a terse syntax for modifying nested fields +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">(</span>l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>isAttrExploded <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>definition<span style="color:#f92672">.</span>entities <span style="color:#f92672">~</span> l<span style="color:#f92672">.</span>entities<span style="color:#f92672">).</span>set<span style="color:#f92672">(</span>input<span style="color:#f92672">)((</span><span style="color:#66d9ef">true</span><span style="color:#f92672">,</span> updatedEntityDefinitions<span style="color:#f92672">,</span> updatedEntities<span style="color:#f92672">))</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">}</span> +</span></span></code></pre></div><p>Note that <code>EntityType</code> does not hold the dataset&rsquo;s full SQL schema currently, as it&rsquo;s not useful for pattern matching, but can be accessed directly from <code>DataFrame</code> if 
needed.</p> +<h1 id="inputoutput">Input/output</h1> +<p>The <code>Reader</code> and <code>Writer</code> typeclasses are used to read from a <code>Source</code> and write to a <code>Sink</code> respectively, terminating a graph transformation pipeline<br> +on both ends.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">T</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> read<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Ret</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> exists<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">T</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Boolean</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">trait</span> <span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">S</span><span style="color:#f92672">]</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">type</span> <span 
style="color:#66d9ef">Data</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">def</span> write<span style="color:#f92672">(</span>self<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Data</span><span style="color:#f92672">,</span> sink<span style="color:#66d9ef">:</span> <span style="color:#66d9ef">S</span><span style="color:#f92672">)</span><span style="color:#66d9ef">:</span> <span style="color:#66d9ef">Unit</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">}</span> +</span></span></code></pre></div><p>There are implementations under <code>ldbc.datagen.io.instances</code> that read a graph from a <code>GraphSource</code> and write to a <code>GraphSink</code>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span 
style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> source <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Reader</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSource</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>read<span style="color:#f92672">(</span>source<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transform <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">ExplodeAttrs</span><span style="color:#f92672">.</span>andThen<span style="color:#f92672">(</span><span style="color:#a6e22e">ExplodeEdges</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> transformedGraph <span style="color:#66d9ef">=</span> transform<span style="color:#f92672">(</span>graph<span style="color:#f92672">)</span> +</span></span><span 
style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> sink <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">Writer</span><span style="color:#f92672">[</span><span style="color:#66d9ef">GraphSink</span>, <span style="color:#66d9ef">Graph</span><span style="color:#f92672">[</span><span style="color:#66d9ef">Mode.Raw.</span><span style="color:#66d9ef">type</span><span style="color:#f92672">]].</span>write<span style="color:#f92672">(</span>transformedGraph<span style="color:#f92672">,</span> sink<span style="color:#f92672">)</span> +</span></span></code></pre></div><p>We provide <a href="https://github.com/typelevel/simulacrum">Ops syntax</a> to make it shorter:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.model.Mode +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> 
ldbc.snb.datagen.io.graphs.<span style="color:#f92672">{</span><span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">}</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.instances._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Reader.ops._ +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">import</span> ldbc.snb.datagen.io.Writer.ops._ +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// read +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> inputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/input/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> inputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;parquet&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> graph <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">GraphSource</span><span style="color:#f92672">(</span>model<span style="color:#f92672">.</span>graphs<span style="color:#f92672">.</span><span style="color:#a6e22e">Raw</span><span style="color:#f92672">.</span>graphDef<span style="color:#f92672">,</span> inputPath<span style="color:#f92672">,</span> inputFormat<span style="color:#f92672">).</span>read +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// transform +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> transformedGraph <span 
style="color:#66d9ef">=</span> <span style="color:#f92672">???</span> <span style="color:#75715e">/* ... */</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// write +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span><span style="color:#66d9ef">val</span> outputPath <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;path/to/output/graph&#34;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">val</span> outputFormat <span style="color:#66d9ef">=</span> <span style="color:#e6db74">&#34;csv&#34;</span> +</span></span><span style="display:flex;"><span>transformedGraph<span style="color:#f92672">.</span>write<span style="color:#f92672">(</span><span style="color:#a6e22e">GraphSink</span><span style="color:#f92672">(</span>outputPath<span style="color:#f92672">,</span> outputFormat<span style="color:#f92672">))</span> +</span></span></code></pre></div><p>The reader/writer architecture is layered, the graph reader/writer uses dataframe readers/writers for each of its entities. One interesting aspect of implementing the reader was dealing with the input schema. Parquet is self-describing, however as we also support the CSV format, we had to provide a way for correct schema detection and column parsing.</p> +<p>Spark has a facility to derive SparkSQL schema from case classes automatically<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup>. We created case classes for each entity in the <code>Raw</code> dataset. 
We also created a typeclass <code>EntityTraits</code> associating these classes with their <code>EntityType</code>, so we can summon them (and consequently their SparkSQL schema) in the reader.</p> +<p>The case classes are used during the serialization of the generated dataset too, but more about that later.</p> +<h1 id="factor-generation">Factor generation</h1> +<p>As we already mentioned, factor generation was originally part of the data generator, i.e. factor tables were calculated on the fly and emitted as side outputs. This design had some problems. Auxiliary data structures had to be maintained and interleaved with generation, which violated separation of concerns, consequently hurting readability and maintainability. Also, anything more complicated than entity-local aggregates was impossible to express in the original MapReduce framework. To keep the preceding Spark rewrite at a manageable scope, the original factor generation code had been removed.</p> +<p>We decided it&rsquo;s best to reintroduce factor generation as a post-processing step that operates on the generated data. This makes it possible to express more complex analytical queries, requires no prior knowledge about the generator, can be done in SparkSQL (making it much simpler), and removes the impact on the generator&rsquo;s performance, so that we can optimize them separately. Since this refactor, we almost tripled the number of factor tables (up to 31 to cover both SNB workloads, BI and Interactive). The queries computing certain factor tables even use <a href="https://spark.apache.org/graphx/">GraphX</a>, which was unimaginable with the previous design.</p> +<p>Factor tables are added by extending a map with a <code>name -&gt; Factor</code> pair. 
<code>Factor</code> declares its input entities, and accepts a function that receives input <code>DataFrames</code>, and returns a single <code>DataFrame</code> as output.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-scala" data-lang="scala"><span style="display:flex;"><span><span style="color:#66d9ef">val</span> factors <span style="color:#66d9ef">=</span> <span style="color:#a6e22e">Map</span> <span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;personDisjointEmployerPairs&#34;</span> <span style="color:#f92672">-&gt;</span> <span style="color:#a6e22e">Factor</span><span style="color:#f92672">(</span><span style="color:#a6e22e">PersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonKnowsPersonType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">OrganisationType</span><span style="color:#f92672">,</span> <span style="color:#a6e22e">PersonWorkAtCompanyType</span><span style="color:#f92672">)</span> <span style="color:#f92672">{</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">case</span> <span style="color:#a6e22e">Seq</span><span style="color:#f92672">(</span>person<span style="color:#f92672">,</span> personKnowsPerson<span style="color:#f92672">,</span> organisation<span style="color:#f92672">,</span> workAt<span style="color:#f92672">)</span> <span style="color:#66d9ef">=&gt;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> knows <span style="color:#66d9ef">=</span> undirectedKnows<span style="color:#f92672">(</span>personKnowsPerson<span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> company <span style="color:#66d9ef">=</span> 
organisation<span style="color:#f92672">.</span>where<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;Type&#34;</span> <span style="color:#f92672">===</span> <span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">).</span>cache<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">val</span> personSample <span style="color:#66d9ef">=</span> person +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>orderBy<span style="color:#f92672">(</span>$<span style="color:#e6db74">&#34;id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>limit<span style="color:#f92672">(</span><span style="color:#ae81ff">20</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> personSample +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Person2&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>knows<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;knows&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;knows.person2Id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>workAt<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;workAt&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;workAt.PersonId&#34;</span> <span 
style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;knows.Person1id&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>join<span style="color:#f92672">(</span>company<span style="color:#f92672">.</span>as<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;Company&#34;</span><span style="color:#f92672">),</span> $<span style="color:#e6db74">&#34;Company.id&#34;</span> <span style="color:#f92672">===</span> $<span style="color:#e6db74">&#34;workAt.CompanyId&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>select<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2id&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.name&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyName&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Company.id&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;companyId&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.creationDate&#34;</span><span style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2creationDate&#34;</span><span style="color:#f92672">),</span> +</span></span><span style="display:flex;"><span> $<span style="color:#e6db74">&#34;Person2.deletionDate&#34;</span><span 
style="color:#f92672">.</span>alias<span style="color:#f92672">(</span><span style="color:#e6db74">&#34;person2deletionDate&#34;</span><span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">)</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span>distinct<span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">},</span> +</span></span><span style="display:flex;"><span> <span style="color:#75715e">/* more factors */</span> +</span></span><span style="display:flex;"><span><span style="color:#f92672">)</span> +</span></span></code></pre></div><p>As you can see, it&rsquo;s not much more complicated than using plain SQL, with the added benefit of being able to extract recurring subqueries to functions (e.g. <code>undirectedKnows</code>). Currently, there&rsquo;s no parallelization between different factor tables (although each of them is parallelized internally by Spark). The Factor table writer uses the same componentized architecture as the graph writer, i.e. it uses the dataframe writer under the hood.</p> +<h1 id="revamping-the-data-generators-serializer">Revamping the data generator&rsquo;s serializer</h1> +<p>At this point, both the transformation pipeline and factor generator were ready, however the data generator was still chugging with the old serializer, emitting the IR in CSV. We wanted to move this to Parquet to improve performance and reduce its size, but there was a problem: due to the generator&rsquo;s custom data representation, SparkSQL (and its DataSource API) was off-limits. So we&rsquo;ve bitten the bullet, and rewritten the existing serializer to emit Parquet.</p> +<blockquote> +<p><a href="https://parquet.apache.org/">Parquet</a> is an open source data format that evolved to be the de facto standard for Big Data batch pipelines. 
It offers a column-oriented, compressed, schemaful representation that is space-efficient and suited for analytic queries. The file format leverages a record shredding and assembly model, which originated at Google. This results in a file that is optimized for query performance and minimizing I/O.</p> +</blockquote> +<p>The new serialization framework is heavily influenced by the design of Java <code>OutputStreams</code>, in the sense that stateful objects are composed to form a pipeline. For example, in case of <em>activities</em>, the input is an activity tree, and the output is a set of rows in multiple files (eg. forum, forumHasTag, post, postHasTag, etc.). The components that take part in activity serialization are shown on the diagram below. The activity tree is iterated (1st component) and the corresponding entity serializer is called (2nd component), which is fed into a component that splits the records (3rd one) among several output streams writing individual files (last).</p> +<p><img src="activity.png" alt="Activity serialization pipeline"></p> +<p>The benefit of this architecture is that only the last component needs to change when we add support for a new output format.</p> +<p>To support Parquet, we made use of row-level serializers available in Hadoop&rsquo;s Parquet library (bundled with SparkSQL), and internal classes in SparkSQL to derive Parquet schema for our entities. Remember how we used case classes for the <code>Raw</code> entities to derive the input schema in the graph reader during dataset transformation? Here we use the same classes (e.g. 
<code>Forum</code>) and Spark&rsquo;s <code>Encoder</code> framework to encode the entities in Parquet, which means that the generated output remains consistent with the <code>DataFrame</code>-based reader, and we spare a lot of code duplication.</p> +<h1 id="optimizations">Optimizations</h1> +<p>After these refactors, we were able to generate the BI dataset with scale factor 10K on 300 i3.4xlarge machines in one hour. Decreasing the number of machines resulted in out of memory errors in the generator. We realized partition sizes (and thus the number of partitions) should be determined based on available memory. Our experiments showed that a machine with 128GB of memory is capable of generating SF3K (scale factor 3000) reliably with 3 blocks<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> per partition given ample disk size to allow for spills (tested with 3.8TB); while fewer partitions (consequently, a larger block/partition ratio) would introduce OOM errors. Furthermore, we split the data generator output after a certain number of rows written, to fend against the skew between different kinds of entities possibly causing problems during transformation<sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup>. These optimizations enabled us to run SF10K reliably on 4 i3.4xlarge machines in 11 hours (which is still more than a 6x reduction in cost). We weren&rsquo;t able to run SF30K on 10 machines (1 machine / SF3K), even 15 ran out of disk. 
This non-linear disk use should be investigated further as it complicates calculating cluster sizes for larger scale factors.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf3k_bi <span style="color:#ae81ff">3000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">330</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>./tools/emr/submit_datagen_job.py sf10k_bi <span style="color:#ae81ff">10000</span> parquet bi <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --sf-per-executor <span style="color:#ae81ff">3000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span 
style="color:#ae81ff"></span> --partitions <span style="color:#ae81ff">1000</span> <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --jar $JAR_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --instance-type i3.4xlarge <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> --bucket $BUCKET_NAME <span style="color:#ae81ff">\ +</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> -- --explode-edges --explode-attrs +</span></span></code></pre></div><p>The above examples are working configurations for generating the 3K and 10K BI datasets. The <code>--sf-per-executor</code> option controls the number of worker nodes allocated, in this case 1 node per every 3000 SF, i.e. 1 and 4 nodes respectively. The <code>--partitions</code> option controls the total number of partitions, and was calculated based on the number of persons using the formula <code>partitions = ceil(number_of_persons / block_size / 3)</code> to get a maximum of 3 blocks per partition.</p> +<h1 id="conclusion">Conclusion</h1> +<p>These improvements made LDBC SNB datagen more modular, maintainable and efficient, costing under a cent per scale factor to generate the BI dataset, which enables us to generate datasets beyond SF 100K.</p> +<h1 id="footnotes">Footnotes</h1> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>The generator produces hierarchies, such as a forum wall with a random number of posts, that have comments, etc. 
This tree is iterated, and different entities are written to separate files.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>Shameless plug: You can learn more on this from <a href="https://www.dataversity.net/case-study-deriving-spark-encoders-and-schemas-using-implicits/">another blogpost of mine</a>.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>The data generator produces blocks of 10,000 persons and their related entities. Entities from different blocks are unrelated (isolated).&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>The maximum row count per file is currently 10M, however, this can be modified with a command line option. We also had an alternative design in mind where this number would have been determined based on the average row size of each entity, however, we stayed with the first version for simplicity.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Fifteenth TUC Meeting + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + Fri, 17 Jun 2022 09:20:00 -0500 + + https://ldbcouncil.org/event/fifteenth-tuc-meeting/ + <p><strong>Organizers:</strong> Gábor Szárnyas, Jack Waudby, Peter Boncz, Alastair Green</p> +<p>LDBC is hosting a <strong>two-day</strong> hybrid workshop, co-located with <a href="https://2022.sigmod.org/venue.shtml">SIGMOD 2022</a> on <strong>June 17-18 (Friday-Saturday)</strong>.</p> +<p>The program consists of 10-15 minute talks followed by a Q&amp;A session. The talks will be recorded and made available online.<br> +The tentative program is the following. 
<strong>All times are in EDT.</strong></p> +<p>We will have a social event on Friday at 17:30 at <a href="https://elvezrestaurant.com/">El Vez</a> (<a href="https://g.page/ElVezPhilly">Google Maps</a>).</p> +<h4 id="friday-pennsylvania-convention-centerhttpswwwpaconventioncom-room-204bhttps2022sigmodorgprogramshtml">Friday (<a href="https://www.paconvention.com/">Pennsylvania Convention Center</a>, <a href="https://2022.sigmod.org/program.shtml">room 204B</a>)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>09:20</td> +<td>09:30</td> +<td>Peter Boncz (LDBC/CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a>, <a href="https://youtu.be/39BoOIGk9Is">video</a></td> +</tr> +<tr> +<td>09:30</td> +<td>09:45</td> +<td>Alastair Green (LDBC/Birkbeck)</td> +<td>LDBC&rsquo;s fair use policies – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-fair-use-of-the-ldbc-trademark.pdf">slides</a>, <a href="https://youtu.be/7zmCysN4Rpg">video</a></td> +</tr> +<tr> +<td>09:50</td> +<td>10:05</td> +<td>Gábor Szárnyas (LDBC/CWI), Jack Waudby (Newcastle University)</td> +<td>LDBC Social Network Benchmark: Business Intelligence workload v1.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/gabor-szarnyas-the-ldbc-social-network-benchmark-business-intelligence-workload.pdf">slides</a>, <a href="https://youtu.be/AJ96M8_njxE">video</a></td> +</tr> +<tr> +<td>10:10</td> +<td>10:25</td> +<td>Heng Lin (Ant Group)</td> +<td>LDBC Financial Benchmark introduction – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/heng-lin-ldbc-financial-benchmark-introduction.pdf">slides</a>, <a 
href="https://youtu.be/iBhud_YjafY">video</a></td> +</tr> +<tr> +<td>10:30</td> +<td>11:00</td> +<td><em>coffee break</em></td> +<td></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Chen Zhang (CreateLink)</td> +<td>New LDBC SNB benchmark record by Galaxybase: More than 6 times faster and 70% higher throughput – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/chen-zhang-new-ldbc-snb-benchmark-record-by-galaxybase-more-than-6-times-faster-and-70-percent-higher-throughput.pdf">slides</a>, <a href="https://youtu.be/sMzTsb8iw_Y">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>James Clarkson (Neo4j)</td> +<td>LDBC benchmarks: Promoting good science and industrial consumption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/james-clarkson-ldbc-benchmarks-promoting-good-science-and-industrial-consumption.pdf">slides</a>, <a href="https://youtu.be/VYG1mzcl9qQ">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Oskar van Rest (Oracle)</td> +<td>Creating and querying property graphs in Oracle, on-premise and in the cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oskar-van-rest-creating-and-querying-property-graphs-in-oracle-on-premise-and-in-the-cloud.pdf">slides</a>, <a href="https://youtu.be/2HX2Vixf2gs">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>12:15</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>Conquering LDBC SNB BI at SF-10k – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/mingxi-wu-conquering-ldbc-snb-bi-at-sf10k.pdf">slides</a>, <a href="https://youtu.be/oJbqzQ_t3G8">video</a></td> +</tr> +<tr> +<td>12:20</td> +<td>13:20</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:20</td> +<td>13:35</td> +<td>Altan Birler (Technische Universität München)</td> +<td>Relational databases can 
handle graphs too! Experiences with optimizing the Umbra RDBMS for LDBC SNB BI – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/altan-birler-relational-databases-can-handle-graphs-too.pdf">slides</a>, <a href="https://youtu.be/cRgbdY3I2i4">video</a></td> +</tr> +<tr> +<td>13:40</td> +<td>13:55</td> +<td>David Püroja (CWI)</td> +<td>LDBC Social Network Benchmark: Interactive workload v2.0 – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/david-puroja-ldbc-snb-interactive-workload-v2.0.pdf">slides</a></td> +</tr> +<tr> +<td>14:00</td> +<td>14:15</td> +<td>Angela Bonifati (Lyon 1 University)</td> +<td>The quest for schemas in graph databases – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/angela-bonifati-the-quest-for-schemas-in-graph-databases.pdf">slides</a>, <a href="https://youtu.be/VT7cx3Jp7V8">video</a></td> +</tr> +<tr> +<td>14:20</td> +<td>14:35</td> +<td>Matteo Lissandrini (Aalborg University)</td> +<td>Understanding graph data representations in triplestores – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/matteo-lissandrini-understanding-graph-data-representations-in-triplestores.pdf">slides</a>, <a href="https://youtu.be/xqVMJZfh_JU">video</a></td> +</tr> +<tr> +<td>14:40</td> +<td>14:55</td> +<td>Wim Martens (University of Bayreuth)</td> +<td>Path representations – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/wim-martens-path-representations.pdf">slides</a>, <a href="https://youtu.be/Ma-E5dwgf-E">video</a></td> +</tr> +<tr> +<td>15:00</td> +<td>15:20</td> +<td>Audrey Cheng (UC Berkeley)</td> +<td>TAOBench: An end-to-end benchmark for social network workloads – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/audrey-cheng-taobench.pdf">slides</a>, <a href="https://youtu.be/1p8AStxS3es">video</a></td> +</tr> +</tbody> +</table> +<h4 id="saturday-philadelphia-marriott-downtownhttpswwwmarriottcomen-ushotelsphldt-philadelphia-marriott-downtown-room-401-402-4th-floor">Saturday (<a href="https://www.marriott.com/en-us/hotels/phldt-philadelphia-marriott-downtown/">Philadelphia Marriott Downtown</a>, room 401-402, 4th floor)</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>finish</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>10:00</td> +<td>10:15</td> +<td>Keith Hare (WG3)</td> +<td>An update on the GQL &amp; SQL/PGQ standards efforts – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/keith-hare-property-graph-standards-process-and-timing.pdf">slides</a>, <a href="https://youtu.be/xFVD3LWnKlc">video</a></td> +</tr> +<tr> +<td>10:20</td> +<td>10:35</td> +<td>Leonid Libkin (ENS Paris)</td> +<td>Pattern matching in GQL and SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/leonid-libkin-pattern-matching-in-gql-and-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/OvGsa0qLANE">video</a></td> +</tr> +<tr> +<td>10:40</td> +<td>10:55</td> +<td>Petra Selmer (Neo4j/WG3)</td> +<td>An overview of GQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/petra-selmer-towards-gql-v1-a-property-graph-query-language-standard.pdf">slides</a>, <a href="https://youtu.be/tncf2FgyIyo">video</a></td> +</tr> +<tr> +<td>11:00</td> +<td>11:15</td> +<td>Alastair Green (LDBC/WG3)</td> +<td>GQL 2.0: A technical manifesto – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/alastair-green-gql-2.0-a-technical-manifesto.pdf">slides</a>, <a 
href="https://youtu.be/upIvpYy8C2g">video</a></td> +</tr> +<tr> +<td>11:20</td> +<td>11:35</td> +<td>George Fletcher (TU Eindhoven)</td> +<td>PG-Keys (LDBC Property Graph Schema Working Group) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/george-fletcher-pg-keys-keys-for-property-graphs.pdf">slides</a>, <a href="https://youtu.be/_W8-jOtcObc">video</a></td> +</tr> +<tr> +<td>11:40</td> +<td>11:55</td> +<td>Arvind Shyamsundar (Microsoft)</td> +<td>Graph capabilities in Microsoft SQL Server and Azure SQL Database – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/arvind-shyamsundar-graph-capabilities-in-microsoft-sql-server-and-azure-database.pdf">slides</a>, <a href="https://youtu.be/xxV2BfZupGw">video</a></td> +</tr> +<tr> +<td>12:00</td> +<td>13:30</td> +<td><em>lunch (on your own)</em></td> +<td></td> +</tr> +<tr> +<td>13:30</td> +<td>13:45</td> +<td>Daniël ten Wolde (CWI)</td> +<td>Implementing SQL/PGQ in DuckDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/daniel-ten-wolde-implementing-sql-pgq-in-duckdb.pdf">slides</a>, <a href="https://youtu.be/JmSfU0BTH5w">video</a></td> +</tr> +<tr> +<td>13:50</td> +<td>14:05</td> +<td>Oszkár Semeráth, Kristóf Marussy (TU Budapest)</td> +<td>Generation techniques for consistent, realistic, diverse, and scalable graphs – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/oszkar-semerath-generation-techniques-for-consistent-realistic-diverse-and-scalable-graphs.pdf">slides</a>, <a href="https://youtu.be/hB6j6mvh-vA">video</a></td> +</tr> +<tr> +<td>14:10</td> +<td>14:25</td> +<td>Molham Aref (RelationalAI)</td> +<td>Graph Normal Form – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/molham-aref-graph-normal-form.pdf">slides</a>, <a 
href="https://youtu.be/-kP4Raqr5KA">video</a></td> +</tr> +<tr> +<td>14:30</td> +<td>14:45</td> +<td>Naomi Arnold (Queen Mary University of London)</td> +<td>Temporal graph analysis of the far-right social network Gab – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/naomi-arnold-temporal-graph-analysis-of-the-far-right-social-network-gab.pdf">slides</a>, <a href="https://youtu.be/ugSkFlif4PE">video</a></td> +</tr> +<tr> +<td>14:50</td> +<td>15:05</td> +<td>Domagoj Vrgoč (PUC Chile)</td> +<td>Evaluating path queries in MillenniumDB – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/domagoj-vrgoc-regular-path-queries-in-millenniumdb.pdf">slides</a>, <a href="https://youtu.be/_OzJ6vI7GNU">video</a></td> +</tr> +<tr> +<td>15:10</td> +<td>15:25</td> +<td>Pavel Klinov, Evren Sirin (Stardog)</td> +<td>Stardog&rsquo;s experience with LDBC – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifteenth-tuc-meeting/attachments/evren-sirin-stardog-experience-with-ldbc.pdf">slides</a>, <a href="https://youtu.be/CBrEeOTqGKM">video</a></td> +</tr> +</tbody> +</table> + + + + + Announcing the LDBC Financial Benchmark Task Force + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + Thu, 26 May 2022 00:00:00 +0000 + + https://ldbcouncil.org/post/announcing-the-ldbc-financial-benchmark-task-force/ + <p>We are delighted to announce the set up of the <a href="https://ldbcouncil.org/benchmarks/finbench/">Financial Benchmark (FinBench) task force</a>.</p> +<p>The Financial Benchmark (FinBench) project aims to define a graph database evaluating benchmark and develop a data generation process and a query driver to make the evaluation of the graph database representative, reliable and comparable, especially in financial scenarios, such as anti-fraud and risk control. 
The FinBench is scheduled to be released at the end of 2022.</p> +<p>Compared to LDBC SNB, the FinBench will differ in application scenarios, data patterns, and workloads, resulting in different schema characteristics, latency bounds, path filters, etc. FinBench is going to redesign the data pattern and workloads, including the data generation, the query driver, and also some other facilities referred to LDBC SNB.</p> +<p>The FinBench Task Force was approved by LDBC on May 16, 2022. The FinBench Task Force is led by Ant Group, and the initial members also include Pometry, Create Link, StarGraph, Ultipa, Katana, Intel, Memgraph (observer) and Koji Annoura (individual member). See the <a href="https://ldbcouncil.org/benchmarks/finbench/ldbc-finbench-work-charter.pdf">Work Charter for FinBench</a>.</p> +<p>If you are interested in joining FinBench Task Force, please reach out at info at ldbcouncil.org or guozhihui.gzh at antgroup.com.</p> + + + + + Fourteenth TUC Meeting + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + Mon, 16 Aug 2021 16:00:00 +0200 + + https://ldbcouncil.org/event/fourteenth-tuc-meeting/ + <p>LDBC was hosting a one-day hybrid workshop, co-located with <a href="https://vldb.org/2021/">VLDB 2021</a> on <strong>August 16 (Monday) between 16:00–20:00 CEST</strong>.</p> +<p>The physical part of the workshop was held in room Akvariet 2 of the <a href="https://www.tivolihotel.com/">Tivoli Hotel</a> (Copenhagen), while the virtual part was hosted on Zoom. Our programme consisted of talks that provide an overview of LDBC&rsquo;s recent efforts. Moreover, we have invited industry practitioners and academic researchers to present their latest results.</p> +<p>Talks were scheduled to be 10 minutes with a short Q&amp;A session. We had three sessions. 
Their schedules are shown below.</p> +<h4 id="16001725-cest-ldbc-updates-benchmarks-query-languages">[16:00–17:25 CEST] LDBC updates, benchmarks, query languages</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>16:00</td> +<td>Peter Boncz (CWI)</td> +<td>State of the union – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/peter-boncz-state-of-the-union.pdf">slides</a></td> +</tr> +<tr> +<td>16:05</td> +<td>Gábor Szárnyas (CWI)</td> +<td>Overview of LDBC benchmarks – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/gabor-szarnyas-ldbc-benchmarks.pdf">slides</a></td> +</tr> +<tr> +<td>16:12</td> +<td>Mingxi Wu (TigerGraph)</td> +<td>LDBC Social Network Benchmark results with TigerGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mingxi-wu-tigergraph-snb-preliminary-results.pdf">slides</a></td> +</tr> +<tr> +<td>16:24</td> +<td>Xiaowei Zhu (Ant Group)</td> +<td>Financial Benchmark proposal – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/xiaowei-zhu-financial-benchmark.pdf">slides</a></td> +</tr> +<tr> +<td>16:36</td> +<td>Petra Selmer (Neo4j)</td> +<td>Status report from the Existing Languages Working Group (ELWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/petra-selmer-elwg.pdf">slides</a>, <a href="https://youtu.be/I5A8VuFDhsA">video</a></td> +</tr> +<tr> +<td>16:48</td> +<td>Jan Hidders (Birkbeck)</td> +<td>Status report from the Property Graph Schema Working Group (PGSWG) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/jan-hidders-pgswg.pdf">slides</a>, <a href="https://youtu.be/iEbVi9T-HVk">video</a></td> +</tr> +<tr> +<td>17:00</td> +<td>Keith Hare (JCC 
Consulting)</td> +<td>Database Language Standards Structure and Process, SQL/PGQ – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/keith-hare-database-language-standards-structure-and-process-sql-pgq.pdf">slides</a>, <a href="https://youtu.be/ZgFCuzods4g">video</a></td> +</tr> +<tr> +<td>17:12</td> +<td>Stefan Plantikow (GQL Editor)</td> +<td>Report on the GQL standard – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/stefan-plantikow-gql.pdf">slides</a>, <a href="https://youtu.be/z0pN5NwKsgc">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="17351845-cest-systems-and-data-structures">[17:35–18:45 CEST] Systems and data structures</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>17:35</td> +<td>Vasileios Trigonakis (Oracle Labs)</td> +<td>PGX.D aDFS: An Almost Depth-First-Search Distributed Graph-Querying System – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasileios-trigonakis-pgxd-adfs.pdf">slides</a>, <a href="https://youtu.be/cv2ZfWRBOek">video</a></td> +</tr> +<tr> +<td>17:47</td> +<td>Matthias Hauck (SAP)</td> +<td>JSON, Spatial, Graph – Multi-model Workloads with SAP HANA Cloud – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/matthias-hauck-json-spatial-graph-sap-hana-cloud.pdf">slides</a>, <a href="https://youtu.be/dgpMJFho6Q8">video</a></td> +</tr> +<tr> +<td>17:59</td> +<td>Nikolay Yakovets (Eindhoven University of Technology)</td> +<td>AvantGraph – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/nikolay-yakovets-avantgraph.pdf">slides</a>, <a href="https://youtu.be/9M9FOycovTw">video</a></td> +</tr> +<tr> +<td>18:11</td> +<td>Semih Salihoglu (University of Waterloo)</td> +<td>GRainDB: Making RDBMSs Efficient on Graph Workloads 
Through Predefined Joins – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semih-salihoglu-graindb.pdf">slides</a>, <a href="https://youtu.be/FFK3y6vPHJs">video</a></td> +</tr> +<tr> +<td>18:23</td> +<td>Semyon Grigorev (Saint Petersburg University)</td> +<td>Context-free path querying: Obstacles on the way to adoption – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/semyon-grigorev-cfpq.pdf">slides</a>, <a href="https://youtu.be/pha1xIpEL3I">video</a></td> +</tr> +<tr> +<td>18:35</td> +<td>Per Fuchs (Technical University of Munich)</td> +<td>Sortledton: A universal, transactional graph data structure – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/per-fuchs-sortledton.pdf">slides</a>, <a href="https://youtu.be/33ZjsNN0hhU">video</a></td> +</tr> +</tbody> +</table> +<p><em>coffee break (10 minutes)</em></p> +<h4 id="1855-2000-cest-high-level-approaches-and-benchmarks">[18:55-20:00 CEST] High-level approaches and benchmarks</h4> +<table> +<thead> +<tr> +<th>start</th> +<th>speaker</th> +<th>title</th> +</tr> +</thead> +<tbody> +<tr> +<td>18:55</td> +<td>Angelos-Christos Anadiotis (Ecole Polytechnique and Institut Polytechnique de Paris)</td> +<td>Empowering Investigative Journalism with Graph-based Heterogeneous Data Management – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/angelos-christos-anadiotis-investigative-journalism-graph-data-management.pdf">slides</a>, <a href="https://youtu.be/a1VYjyec8dg">video</a></td> +</tr> +<tr> +<td>19:07</td> +<td>Vasia Kalavri (Boston University)</td> +<td>Learning to partition unbounded graph streams – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/vasia-kalavri-learning-to-partition-unbounded-graph-streams.pdf">slides</a>, <a 
href="https://youtu.be/PTlUABKWniA">video</a></td> +</tr> +<tr> +<td>19:19</td> +<td>Muhammad Attahir Jibril (TU Ilmenau)</td> +<td>Towards a Hybrid OLTP-OLAP Graph Benchmark – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/muhammad-attahir-jibril-hybrid-oltp-olap-benchmark.pdf">slides</a>, <a href="https://youtu.be/tMBVszTSJXc">video</a></td> +</tr> +<tr> +<td>19:31</td> +<td>Riccardo Tommasini (University of Tartu)</td> +<td>An outlook on Benchmarks for Graph Stream Processing – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/riccardo-tommasini-graph-stream-processing-benchmarks.pdf">slides</a>, <a href="https://youtu.be/HabvJvPXsLc">video</a></td> +</tr> +<tr> +<td>19:43</td> +<td>Mohamed Ragab (University of Tartu)</td> +<td>Benchranking: Towards prescriptive analysis of big graph processing: the case of SparkSQL – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourteenth-tuc-meeting/attachments/mohamed-ragab-benchranking.pdf">slides</a>, <a href="https://youtu.be/mZ8LhGUq7Wg">video</a></td> +</tr> +</tbody> +</table> + + + + + Thirteenth TUC Meeting + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + Tue, 30 Jun 2020 14:00:00 +0000 + + https://ldbcouncil.org/event/thirteenth-tuc-meeting/ + <p>LDBC is pleased to announce its Thirteenth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry – LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a two-day event hosted online. We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Gabor Szarnyas (BME) to register.</p> +<h3 id="snb-task-force">SNB Task Force</h3> +<ul> +<li>Progress report +<ul> +<li>ACID compliance test suite</li> +<li>Integrating deletions to Datagen</li> +<li>Migrating Datagen to Spark</li> +<li>Redesign of BI read queries</li> +<li>Extensions to the driver</li> +</ul> +</li> +<li>Ongoing work +<ul> +<li>Datagen: tuning the distribution of deletes</li> +<li>Interactive 2.0 workload</li> +<li>BI 1.0 workload</li> +</ul> +</li> +</ul> +<p>Zoom links will be sent through email.</p> + + + + + Speeding Up LDBC SNB Datagen + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + Fri, 12 Jun 2020 00:00:00 +0000 + + https://ldbcouncil.org/post/speeding-up-ldbc-snb-datagen/ + <p>LDBC&rsquo;s <a href="#references">Social Network Benchmark [4]</a> (LDBC SNB) is an industrial and academic initiative, formed by principal actors in the field of graph-like data management. Its goal is to define a framework where different graph-based technologies can be fairly tested and compared, that can drive the identification of systems&rsquo; bottlenecks and required functionalities, and can help researchers open new frontiers in high-performance graph data management.</p> +<p>LDBC SNB provides <a href="https://github.com/ldbc/ldbc_snb_datagen">Datagen</a> (Data Generator), which produces synthetic datasets, mimicking a social network&rsquo;s activity during a period of time. Datagen is defined by the characteristics of realism, scalability, determinism and usability. To address scalability in particular, Datagen has been implemented on the MapReduce computation model to enable scaling out across a distributed cluster. However, since its inception in the early 2010s there has been a tremendous amount of development in the big data landscape, both in the sophistication of distributed processing platforms, as well as public cloud IaaS offerings. 
In the light of this, we should reevaluate this implementation, and in particular, investigate if Apache Spark would be a more cost-effective solution for generating datasets on the scale of tens of terabytes, on public clouds such as Amazon Web Services (AWS).</p> +<h2 id="overview">Overview</h2> +<p>The benchmark&rsquo;s specification describes a social network <a href="https://github.com/ldbc/ldbc_snb_docs/blob/9253abbde94ec7eaccd366c5d4c15cca30752e36/figures/schema-comfortable.pdf">data model</a> which divides its components into two broad categories: static and dynamic. The dynamic element consists of an evolving network where people make friends, post in forums, comment or like each other&rsquo;s posts, etc. In contrast, the static component contains related attributes such as countries, universities and organizations, which are fixed values. For the detailed specifications of the benchmark and the Datagen component, see <a href="#references">References</a>.</p> +<p>Datasets are generated in a multi-stage process captured as a sequence of MapReduce steps (shown in the diagram below).</p> +<p><img src="datagen_flow.png" alt=""> \ <em>Figure 1. LDBC SNB Datagen Process on Hadoop</em></p> +<p>In the initialization phase dictionaries are populated and distributions are initialized. In the first generation phase persons are synthesized, then relationships are wired between them along 3 dimensions (university, interest and random). After merging the graph of person relationships, the resulting dataset is output. Following this, activities such as forum posts, comments, likes and photos are generated and output. Finally, the static components are output.</p> +<p><em>Note: The diagram shows the call sequence as implemented. 
All steps are sequential &ndash; including the relationship generation &ndash;, even in cases when the data dependencies would allow for parallelization.</em></p> +<p>Entities are generated by procedural Java code and are represented as POJOs in memory and as sequence files on disk. Most entities follow a shallow representation, i.e. foreign keys (in relational terms) are mapped to integer ids, which makes serialization straightforward.<sup id="fnref:1"><a href="#fn:1" class="footnote-ref" role="doc-noteref">1</a></sup> A notable exception is the Knows edge which contains only the target vertex, and is used as a navigation property on the source Person. The target Person is replaced with only the foreign key augmented with some additional information in order to keep the structure free of cycles. Needless to say, this <em>edge as property</em> representation makes the data harder to handle in SQL than it would be with a flat join table.</p> +<p>Entity generation amounts to roughly one fifth of the main codebase. It generates properties drawn from several random distributions using mutable pRNGs. Determinism is achieved by initializing the pRNGs to seeds that are fully defined by the configuration with constants, and otherwise having no external state in the logic.<sup id="fnref:2"><a href="#fn:2" class="footnote-ref" role="doc-noteref">2</a></sup></p> +<p>Serialization is done by hand-written serializers for the supported output formats (e.g. CSV) and comprises just a bit less than one third of the main codebase. Most of the output is created by directly interacting with low-level HDFS file streams. 
Ideally, this code should be migrated to higher-level writers that handle faults and give consistent results when the task has to be restarted.</p> +<h2 id="motivations-for-the-migration">Motivations for the migration</h2> +<p>The application is written using Hadoop MapReduce, which is now largely superseded by more modern distributed batch processing platforms, notably Apache Spark. For this reason, it was proposed to migrate Datagen to Spark. The migration provides the following benefits:</p> +<ul> +<li> +<p><strong>Better memory utilization:</strong> MapReduce is disk-oriented, i.e. it writes the output to disk after each reduce stage which is then read by the next MapReduce job. As public clouds provide virtual machines with sufficient RAM to encapsulate any generated dataset, time and money are wasted by the overhead this unnecessary disk I/O incurs. Instead, the intermediate results should be cached in memory where possible. The lack of support for this is a well-known limitation of MapReduce.</p> +</li> +<li> +<p><strong>Smaller codebase:</strong> The Hadoop MapReduce library is fairly ceremonial and boilerplatey. Spark provides a higher-level abstraction that is simpler to work with, while still providing enough control on the lower-level details required for this workload.</p> +</li> +<li> +<p><strong>Small entry cost:</strong> Spark and MapReduce are very close conceptually, they both utilise HDFS under the hood, and run on the JVM. This means that a large chunk of the existing code can be reused, and migration to Spark can, therefore, be completed with relatively small effort. Additionally, MapReduce and Spark jobs can be run on AWS EMR using basically the same HW/SW configuration, which facilitates straightforward performance comparisons.</p> +</li> +<li> +<p><strong>Incremental improvements:</strong> Spark exposes multiple APIs for different workloads and operating on different levels of abstraction. 
Datagen may initially utilise the lower-level, Java-oriented RDDs (which offer the clearest 1 to 1 mapping when coming from MapReduce) and gradually move towards DataFrames to support Parquet output in the serializers and maybe unlock some SQL optimization capabilities in the generators later down the road.</p> +</li> +<li> +<p><strong>OSS, commodity:</strong> Spark is one of the most widely used open-source big data platforms. Every major public cloud provides a managed offering for Spark. Together these mean that the migration increases the approachability and portability of the code.</p> +</li> +</ul> +<h2 id="first-steps">First steps</h2> +<p>The first milestone is a successful run of LDBC Datagen on Spark while making the minimum necessary amount of code alterations. This entails the migration of the Hadoop wrappers around the generators and serializers. The following bullet-points summarize the key notions that cropped up during the process.</p> +<ul> +<li> +<p><strong>Use your memory:</strong> A strong focus was placed on keeping the call sequence intact, so that the migrated code evaluates the same steps in the same order, but with data passed as RDDs. It was hypothesised that the required data could be either cached in memory entirely at all times, or if not, regenerating them would still be faster than involving the disk I/O loop (e.g. by using <code>MEMORY_AND_DISK</code>). In short, the default caching strategy was used everywhere.</p> +</li> +<li> +<p><strong>Regression tests:</strong> Lacking tests apart from an id uniqueness check, meant there were no means to detect bugs introduced by the migration. Designing and implementing a comprehensive test suite was out of scope, so instead, regression testing was utilised, with the MapReduce output as the baseline. 
The original output mostly consists of Hadoop sequence files which can be read into Spark, allowing comparisons to be drawn with the output from the RDD produced by the migrated code.</p> +</li> +<li> +<p><strong>Thread-safety concerns:</strong> Soon after migrating the first generator and running the regression tests, there were clear discrepancies in the output. These only surfaced when the parallelization level was set greater than 1. This indicated the presence of potential race conditions. Thread-safety wasn&rsquo;t a concern in the original implementation due to the fact that MapReduce doesn&rsquo;t use thread-based parallelization for mappers and reducers.<sup id="fnref:3"><a href="#fn:3" class="footnote-ref" role="doc-noteref">3</a></sup> In Spark however, tasks are executed by parallel threads in the same JVM application, so the code is required to be thread-safe. After some debugging, a bug was discovered originating from the shared use of java.text.SimpleDateFormat (notoriously known to be not thread-safe) in the serializers. This was resolved simply by changing to java.time.format.DateTimeFormatter. There were multiple instances of some static field on an object being mutated concurrently. In some cases this was a temporary buffer and was easily resolved by making it an instance variable. In another case a shared context variable was used, which was resolved by passing dedicated instances as function arguments. Sadly, the Java language has the same syntax for accessing locals, fields and statics, <sup id="fnref:4"><a href="#fn:4" class="footnote-ref" role="doc-noteref">4</a></sup> which makes it somewhat harder to find potential unguarded shared variables.</p> +</li> +</ul> +<h2 id="case-study-person-ranking">Case study: Person ranking</h2> +<p>Migrating was rather straightforward, however, the so-called person ranking step required some thought. 
The goal of this step is to organize persons so that similar ones appear close to each other in a deterministic order. This provides a scalable way to cluster persons according to a similarity metric, as introduced in the <a href="#references">S3G2 paper [3]</a>.</p> +<h3 id="the-original-mapreduce-version">The original MapReduce version</h3> +<p><img src="person_ranking.svg" alt=""> \ <em>Figure 2. Diagram of the MapReduce code for ranking persons</em></p> +<p>The implementation, shown in pseudocode above, works as follows:</p> +<ol> +<li>The equivalence keys are mapped to each person and fed into TotalOrderPartitioner which maintains an order sensitive partitioning while trying to emit more or less equal sized groups to keep the data skew low.</li> +<li>The reducer keys the partitions with its own task id and a counter variable which has been initialized to zero and incremented on each person, establishing a local ranking inside the group. The final state of the counter (which is the total number of persons in that group) is saved to a separate &ldquo;side-channel&rdquo; file upon the completion of a reduce task.</li> +<li>In a consecutive reduce-only stage, the global order is established by reading all of these previously emitted count files in the order of their partition number in each reducer, then creating an ordered map from each partition number to the corresponding cumulative count of persons found in all preceding ones. This is done in the setup phase. In the reduce function, the respective count is incremented and assigned to each person.</li> +</ol> +<p>Once this ranking is done, the whole range is sliced up into equally sized blocks, which are processed independently. For example, when wiring relationships between persons, only those appearing in the same block are considered.</p> +<h3 id="the-migrated-version">The migrated version</h3> +<p>Spark provides a sortBy function which takes care of the first step above in a single line. 
The gist of the problem remains collecting the partition sizes and making them available in a later step. While the MapReduce version uses a side output, in Spark the partition sizes are collected in a separate job and passed into the next phase using a broadcast variable. The resulting code size is a fraction of the original one.</p> +<h2 id="benchmarks">Benchmarks</h2> +<p>Benchmarks were carried out on AWS <a href="https://aws.amazon.com/emr/">EMR</a>, originally utilising <a href="https://aws.amazon.com/ec2/instance-types/i3/">i3.xlarge</a> instances because of their fast NVMe SSD storage and ample amount of RAM.</p> +<p>The application parameter hadoop.numThreads controls the number of reduce threads in each Hadoop job for the MapReduce version and the number of partitions in the serialization jobs in the Spark one. For MapReduce, this was set to n_nodes, i.e. the number of machines; experimentation yielded slowdowns for higher values. The Spark version on the other hand, performed better with this parameter set to n_nodes * v_cpu. The scale factor (SF) parameter determines the output size. It is defined so that one SF unit generates around 1 GB of data. That is, SF10 generates around 10 GB, SF30 around 30 GB, etc. It should be noted however, that incidentally the output was only 60% of this in these experiments, stemming from two reasons. One, update stream serialization was not migrated to Spark, due to problems in the original implementation. Of course, for the purpose of faithful comparison the corresponding code was removed from the MapReduce version as well before executing the benchmarks. This explains a 10% reduction from the expected size. 
The rest can be attributed to incorrectly tuned parameters.<sup id="fnref:5"><a href="#fn:5" class="footnote-ref" role="doc-noteref">5</a></sup> The MapReduce results were as follows:</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>16</td> +<td>1.60</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>34</td> +<td>1.13</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>40</td> +<td>1.20</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>MapReduce</td> +<td>i3.xlarge</td> +<td>44</td> +<td>1.32</td> +</tr> +</tbody> +</table> +<p>It can be observed that the runtime per scale factor only increases slowly, which is good. The metric charts show an underutilized, bursty CPU. The bursts are supposedly interrupted by the disk I/O parts when the node is writing the results of a completed job. It can also be seen that the memory only starts to get consumed after 10 minutes of the run have passed.</p> +<p><img src="mr_sf100_cpu_load.png" alt=""> <br> +<em>Figure 3. CPU Load for the Map Reduce cluster is bursty and less than<br> +50% on average (SF100, 2nd graph shows master)</em></p> +<p><img src="mr_sf100_mem_free.png" alt=""> <br> +<em>Figure 4. 
The job only starts to consume memory when already 10 minutes<br> +into the run (SF100, 2nd graph shows master)</em></p> +<p>Let&rsquo;s see how Spark fares.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>10</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>10</td> +<td>1.00</td> +</tr> +<tr> +<td>30</td> +<td>1</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>21</td> +<td>0.70</td> +</tr> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>27</td> +<td>0.81</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>36</td> +<td>1.08</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>i3.xlarge</td> +<td>47</td> +<td>1.41</td> +</tr> +</tbody> +</table> +<p>A similar trend here, however the run times are around 70% of the MapReduce version. It can be seen that the larger scale factors (SF1000 and SF3000) yielded a longer runtime than expected. On the metric charts of SF100 the CPU shows full utilization, except at the end, when the results are serialized in one go and the CPU is basically idle (the snapshot of the diagram doesn&rsquo;t include this part unfortunately). Spark can be seen to have used up all memory pretty fast even in case of SF100. In case of SF1000 and SF3000, the nodes are running so low on memory that most probably some of the RDDs have to be calculated multiple times (no disk level serialization was used here), which seems to be the most plausible explanation for the slowdowns experienced. In fact, the OOM errors encountered when running SF3000 support this hypothesis even further. It was thus proposed to scale up the RAM in the instances. 
The CPU utilization hints that adding some extra vCPUs as well can further yield speedup.</p> +<p><img src="spark_sf100_cpu_load.png" alt=""> <br> +<em>Figure 5. Full CPU utilization for Spark (SF100, last graph shows<br> +master)</em></p> +<p><img src="spark_sf100_mem_free.png" alt=""> <br> +<em>Figure 6. Spark eats up memory fast (SF100, 2nd graph shows master)</em></p> +<p>i3.2xlarge would have been the most straightforward option for scaling up the instances, however the humongous 1.9 TB disk of this image is completely unnecessary for the job. Instead the cheaper r5d.2xlarge instance was utilised, largely identical to i3.2xlarge, except it <em>only</em> has a 300 GB SSD.</p> +<table> +<thead> +<tr> +<th>SF</th> +<th>workers</th> +<th>Platform</th> +<th>Instance Type</th> +<th>runtime (min)</th> +<th>runtime * worker/SF (min)</th> +</tr> +</thead> +<tbody> +<tr> +<td>100</td> +<td>3</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>16</td> +<td>0.48</td> +</tr> +<tr> +<td>300</td> +<td>9</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>21</td> +<td>0.63</td> +</tr> +<tr> +<td>1000</td> +<td>30</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>26</td> +<td>0.78</td> +</tr> +<tr> +<td>3000</td> +<td>90</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +<tr> +<td>10000</td> +<td>303</td> +<td>Spark</td> +<td>r5d.2xlarge</td> +<td>25</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<p>The last column clearly demonstrates our ability to keep the cost per scale factor unit constant.</p> +<h2 id="next-steps">Next steps</h2> +<p>The next improvement is refactoring the serializers so they use Spark&rsquo;s high-level writer facilities. The most compelling benefit is that it will make the jobs fault-tolerant, as Spark maintains the integrity of the output files in case the task that writes it fails. This makes Datagen more resilient and opens up the possibility to run on less reliable hardware configuration (e.g. 
EC2 spot nodes on AWS) for additional cost savings. They will supposedly also yield some speedup on the same cluster configuration.</p> +<p>As already mentioned, the migration of the update stream serialization was ignored due to problems with the original code. Ideally, they should be implemented with the new serializers.</p> +<p>The Spark migration also serves as an important building block for the next generation of LDBC benchmarks. As part of extending the SNB benchmark suite, the SNB task force has recently extended Datagen with support for <a href="#references">generating delete operations [1]</a>. The next step for the task force is to fine-tune the temporal distributions of these deletion operations to ensure that the emerging sequence of events is realistic, i.e. the emerging distribution resembles what a database system would experience when serving a real social network.</p> +<h2 id="acknowledgements">Acknowledgements</h2> +<p>This work is based upon the work of Arnau Prat, Gábor Szárnyas, Ben Steer, Jack Waudby and other LDBC contributors. 
Thanks for your help and feedback!</p> +<h2 id="references">References</h2> +<p>[1] <a href="https://ldbcouncil.org/docs/papers/datagen-deletes-grades-nda-2020.pdf">Supporting Dynamic Graphs and Temporal Entity Deletions in the LDBC Social Network Benchmark&rsquo;s Data Generator</a></p> +<p>[2] <a href="https://www.youtube.com/watch?v=ZQOLuCOOpSI">9th TUC Meeting &ndash; LDBC SNB Datagen Update &ndash; Arnau Prat (UPC)</a> - <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">slides</a></p> +<p>[3] <a href="https://research.vu.nl/en/publications/s3g2-a-scalable-structure-correlated-social-graph-generator">S3G2: a Scalable Structure-correlated Social Graph Generator</a></p> +<p>[4] <a href="https://arxiv.org/abs/2001.02299">The LDBC Social Network Benchmark</a></p> +<p>[5] <a href="https://ldbcouncil.org/">LDBC</a> - <a href="https://github.com/ldbc">LDBC GitHub organization</a></p> +<div class="footnotes" role="doc-endnotes"> +<hr> +<ol> +<li id="fn:1"> +<p>Also makes it easier to map to a tabular format thus it is a SQL friendly representation.&#160;<a href="#fnref:1" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:2"> +<p>It&rsquo;s hard to imagine this done declaratively in SQL.&#160;<a href="#fnref:2" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:3"> +<p>Instead, multiple YARN containers have to be used if you want to parallelize on the same machine.&#160;<a href="#fnref:3" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:4"> +<p>Although editors usually render these using different font styles.&#160;<a href="#fnref:4" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +<li id="fn:5"> +<p>With the addition of deletes, entities often get inserted and deleted during the simulation (which is normal in a social network). 
During serialization, we check for such entities and omit them. However, we forgot to calculate this when determining the output size, which we will amend when tuning the distributions.&#160;<a href="#fnref:5" class="footnote-backref" role="doc-backlink">&#x21a9;&#xfe0e;</a></p> +</li> +</ol> +</div> + + + + + Twelfth TUC Meeting + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + Fri, 05 Jul 2019 08:30:00 +0100 + + https://ldbcouncil.org/event/twelfth-tuc-meeting/ + <p>LDBC is pleased to announce its Twelfth Technical User Community (TUC) meeting.</p> +<p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmarks and graph standards, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j, TigerGraph and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event on the last Friday of <strong><a href="https://sigmod2019.org/">SIGMOD/PODS 2019</a></strong> in Amsterdam, The Netherlands, in the conference venue of <strong><a href="http://sigmod2019.org/conf_venue">Beurs van Berlage</a></strong>. The room is the Mendes da Silva kamer. Please check its tips for <strong><a href="http://sigmod2019.org/accommodation">accommodation in Amsterdam</a></strong>.</p> +<p>Note also that at SIGMOD/PODS in Amsterdam on Sunday, June 30, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2019">GRADES-NDA 2019</a>, that may be of interest to our audience (this generally holds for the whole SIGMOD/PODS program, of course).</p> +<p>We welcome all users of RDF and Graph technologies to attend. 
If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>You need to be registered in order to get into the SIGMOD/PODS venue. Friday, July 5, is the final, workshop, day of SIGMOD/PODS, and the LDBC TUC meeting joins the other workshops for coffee and lunch.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management.</p> +<p><strong>Talk proposals can be sent to Peter Boncz</strong>, who is also the local organizer. <strong>Please also send your slides to this email for archiving on this site.</strong></p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting, there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The morning slot (08:30-10:30) is reserved for an LDBC Board Meeting, to which in principle only LDBC directors are invited (that meeting will be held in the same room).</p> +<p>The TUC meeting will start on Friday morning after the morning coffee break of SIGMOD/PODS 2019 (<strong>room: Mendes da Silva kamer</strong>):</p> +<p>08:30-10:30 LDBC Board Meeting (non-public)</p> +<p>10:30-11:00 Coffee</p> +<p>11:00-12:45 Session 1: Graph Benchmarks</p> +<ul> +<li> +<p>11:00-11:05 Welcome &amp; introduction</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230404.pdf">11:05-11:45 Gabor Szarnyas (BME), Benjamin Steer (QMUL), Jack Waudby (Newcastle University): Business Intelligence workload: Progress report and roadmap</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706117.pdf">11:45-12:00 Frank McSherry (Materialize): Experiences implementing LDBC queries in a dataflow system</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706118.pdf">12:00-12:25 Vasileios Trigonakis (Oracle): Evaluating a new distributed graph query engine with LDBC: Experiences and limitations</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706130.pdf">12:25-12:45 Ahmed Musaafir (VU Amsterdam): LDBC Graphalytics</a></p> +</li> +</ul> +<p>12:45-14:00 Lunch</p> +<p>14:00-16:05 Session 2: Graph Query Languages</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706120.pdf">14:00-14:25 Juan Sequeda (Capsenta): Property Graph Schema Working Group: A progress report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706121.pdf">14:25-14:50 Stefan Plantikow (Neo4j): GQL: Scope and features</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706122.pdf">report</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706119.pdf">14:50-15:15 Vasileios Trigonakis (Oracle): Property graph extensions for the SQL standard</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706129.pdf">15:15-15:40 Alin Deutsch (TigerGraph): Modern graph analytics support in GSQL, TigerGraph&rsquo;s query language</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/112230401.pdf">15:40-16:05 Jan Posiadała (Nodes and Edges, Poland): Executable semantics of graph query language</a></p> +</li> +</ul> +<p>16:05-16:30 Coffee</p> +<p>16:30-17:50 Session 3: Graph System Performance</p> +<ul> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111968258.pdf">16:30-16:50 Per Fuchs (CWI): Fast, scalable WCOJ graph-pattern matching on in-memory graphs in Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706124.pdf">16:50-17:10 Semih Salihoglu (University of Waterloo): Optimizing subgraph queries with a mix of tradition and modernity</a> <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706116.pptx">pptx</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706128.pdf">17:10-17:30 Roi Lipman (RedisGraph): Evaluating Cypher queries and procedures as algebraic operations within RedisGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/twelfth-tuc-meeting/attachments/106233859/111706133.pdf">17:30-17:50 Alexandru Uta (VU Amsterdam): Low-latency Spark queries on updatable data</a></p> +</li> +</ul> +<p>If there is interest, we will organize a social dinner on Friday evening for LDBC attendees.</p> + + + + + Eleventh TUC Meeting + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + Fri, 08 Jun 2018 08:30:00 -0500 + + https://ldbcouncil.org/event/eleventh-tuc-meeting/ + <p>LDBC Technical User Community meetings serve to (1) learn about progress in the LDBC task forces on graph benchmark development, (2) to give feedback on these, and (3) hear about user experiences with graph data management technologies or (4) learn about new graph technologies from researchers or industry &ndash; LDBC counts Oracle, IBM, Intel, Neo4j and Huawei among its members.</p> +<p>This TUC meeting will be a one-day event preceding the <a href="https://sigmod2018.org/">SIGMOD/PODS 2018</a> conference in Houston, Texas (not too far away, the whole next week). 
Note also that at SIGMOD/PODS in Houston on Sunday, June 10, there is a research workshop on graph data management technology called <a href="https://sites.google.com/site/gradesnda2018/">GRADES-NDA 2018</a> as well, so you might combine travel.</p> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a> to register.</p> +<p><strong>=&gt; registration is free, but required &lt;=</strong></p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz (<a href="mailto:boncz@cwi.nl">boncz@cwi.nl</a>) and Larri (<a href="mailto:larri@ac.upc.edu">larri@ac.upc.edu</a>). Local organizer is Juan Sequeda (<a href="mailto:juanfederico@gmail.com">juanfederico@gmail.com</a>).</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present.
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its interactive, business analytics and graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges and products</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00:</p> +<ul> +<li> +<p>10:30-10:35 Peter Boncz (CWI) - introduction to the LDBC TUC meeting</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090478.pdf">10:35-11:00 Juan Sequeda (Capsenta) - Announcing: gra.fo</a></p> +</li> +<li> +<p>11:00-11:30 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090466.pdf">11:30-11:55 Gabor Szarnyas (BME) - LDBC benchmarks: three aspects of graph processing</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090463.pdf">11:55-12:20 Peter Boncz (CWI) - G-CORE: a composable graph query language by LDBC</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090472.pdf">12:20-12:45 Yinglong Xia (Huawei) - Graph Engine for Cloud AI</a></p> +</li> +<li> +<p>12:45-14:00 lunch</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090474.pdf">14:00-14:25 Stefan Plantikow (Neo4j) - Composable Graph Queries and Multiple Named Graphs in Cypher for Apache Spark</a></p> +</li> +<li> +<p><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090481.pdf">14:25-14:50 Oskar van Rest (Oracle) - Analyzing Stack Exchange data using Property Graph in Oracle</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99090485.pdf">14:50-15:15 Brad Bebee (Amazon) - Neptune: the AWS graph management service</a></p> +</li> +<li> +<p>15:15-15:40 coffee break</p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99811329.pdf">15:40-16:05 Bryon Jacob (data.world): Broadening the Semantic Web</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99287041.pdf">16:05-16:30 Jason Plurad (IBM) - Graph Computing with JanusGraph</a></p> +</li> +<li> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99745793.pdf">16:30-16:55 Arthur Keen (Cambridge Semantics): AnzoGraph</a></p> +</li> +<li> +<p><a href="http://relational.ai/">16:55-17:20 Molham Aref (relational.ai)</a> - Introducing..
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eleventh-tuc-meeting/attachments/91422722/99418113.pdf">relational.ai</a></p> +</li> +<li> +<p>18:00 - 20:00 social dinner in Austin (sponsored by Intel Corp.), Coopers BBQ, 217 Congress Ave, Austin, TX 78701</p> +</li> +</ul> +<h3 id="location">Location</h3> +<p>The TUC will be held at the <a href="https://www.cs.utexas.edu/">University of Texas at Austin, Department of Computer Science</a> in the <a href="https://www.google.com/maps/place/The+University+of+Texas:+Department+of+Computer+Science/@30.2860955,-97.737582,18z/data=!4m5!3m4!1s0x0:0x12edecc8226b3241!8m2!3d30.2862279!4d-97.7365348">Gates Dell Complex (GDC): 2317 Speedway, Austin TX, 78712</a> Room: GDC 6.302</p> +<p>The GDC building has a North and a South building. GDC 6.302 is in the North building. When you enter the main entrance, the North building is on the left and it is served by a pair of elevators. You can take either elevator to the 6th floor. Exit the elevator on the 6th floor. Turn left, right, left.</p> +<h3 id="from-austin-to-sigmodpods-houston-on-saturday-june-9">From Austin to SIGMOD/PODS (Houston) on Saturday June 9</h3> +<p>Many of the attendees will be going to SIGMOD/PODS which will be held in Houston.</p> +<h4 id="bus">Bus</h4> +<p>One option is to take a <a href="https://us.megabus.com/journey-planner/journeys?days=1&amp;concessionCount=0&amp;departureDate=2018-06-09&amp;destinationId=318&amp;inboundOtherDisabilityCount=0&amp;inboundPcaCount=0&amp;inboundWheelchairSeated=0&amp;nusCount=0&amp;originId=320&amp;otherDisabilityCount=0&amp;pcaCount=0&amp;totalPassengers=1&amp;wheelchairSeated=0">MegaBus that departs from downtown Austin and arrives at downtown Houston</a>.</p> +<p>There is a bus that departs at 12:00pm and arrives at 3:00pm. Cost is $20 (as of April 23).</p> +<p>If you want to spend the day in Austin, there is a bus that departs at 9:55pm and arrives at 12:50am.
Cost is $5 (as of April 23).</p> + + + + + Tenth TUC Meeting + https://ldbcouncil.org/event/tenth-tuc-meeting/ + Fri, 01 Sep 2017 10:30:00 +0100 + + https://ldbcouncil.org/event/tenth-tuc-meeting/ + <p>This will be a one-day event at the <a href="http://www.vldb.org/2017">VLDB 2017</a> conference in Munich, Germany on September 1, 2017.</p> +<p>Topics and activities of interest in these TUC meetings are:</p> +<ul> +<li>Presentation on graph data management usage scenarios.</li> +<li>Presentation of the benchmarking results for the different benchmarks, as well as the graph query language task force.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested to attend the event, please, contact Adrian Diaz (UPC) at <a href="mailto:adiaz@ac.upc.edu">adiaz@ac.upc.edu</a> to register; registration is free, but required.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals are handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. 
LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Friday morning, with a program from 10:30-17:00</p> +<p>10:30-12:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87588865.pdf">Peter Boncz (CWI): GraphQL task force update - the G-CORE proposal</a> (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868018.pptx">pptx</a>)</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868008.pdf">Gabor Szarnyas (Budapest University of Technology and Economics Hungarian Academy of Sciences): Updates on the Social Network Benchmark BI Workload</a></li> +<li>Alexandru Iosup, Wing Lung Ngai (VU/TU Delft): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868014.pdf">LDBC Graphalytics v0.9</a>, <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868013.pdf">Graphalytics Global Competition and Graphalytics Custom Benchmark</a></li> +</ul> +<p>12:00-13:30: lunch break</p> +<p>13:30-15:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868024.pdf">Arnau Prat (UPC): Datasynth: Democratizing property graph 
generation</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/86868026.pdf">Marcus Paradies (SAP): SAP HANA GraphScript</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031809.pdf">Yinglong Xia (Huawei): The EYWA Graph Engine in a Cloud AI Platform</a></li> +<li>Gaétan Hains (Huawei): Cost semantics for graph queries</li> +</ul> +<p>15:00-15:30: break</p> +<p>15:30-17:00: TUC session (public)</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87031812.pdf">Petra Selmer and Stefan Plantikow (Neo4j): openCypher Developments in 2017</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/87195650.pdf">Markus Kaindl (Springer): SN SciGraph &ndash; Building a Linked Data Knowledge Graph for the Scholarly Publishing Domain</a></li> +<li>Irini Fundulaki (FORTH): The HOBBIT Link Discovery and Versioning Benchmarks</li> +<li>Ghislain Atemezing (Mondeca): Benchmarking Enterprise RDF stores with Publications Office Dataset</li> +</ul> +<p>Speakers should aim for a <strong>20-minute talk</strong>.</p> +<p>Further:</p> +<ul> +<li>on Friday evening (19:00-21:00) there will be a <strong>social dinner</strong> at <a href="https://www.loewenbraeukeller.com/en/pub-and-beer-garden/">Löwenbräukeller</a>, sponsored and arranged by LDBC member Huawei (who have their European Research Center in Munich).</li> +<li>on Friday morning (8:30-10:30) there will be a meeting of the LDBC board of directors, but this meeting is not public.</li> +</ul> +<h3 id="venue">Venue</h3> +<p>The Technical University of Munich (TUM) is hosting that week the <a href="http://www.vldb.org/2017">VLDB conference</a>; on the day of the TUC meeting the main conference will have finished, but there will be a number of co-located workshops 
ongoing, and the TUC participants will blend in with that crowd for the breaks and lunch.</p> +<p>The TUC meeting will be held in <strong>Room 2607</strong> alongside the VLDB workshops that day (MATES, ADMS, DMAH, DBPL and BOSS).</p> +<p><strong>address: Technische Universität München (TUM), Arcisstraße 21, 80333 München</strong></p> +<p><a href="https://www.google.nl/maps/place/Technische+Universit%C3%A4t+M%C3%BCnchen/@48.14966,11.5656715,17z/data=!3m1!4b1!4m5!3m4!1s0x479e7261336d8c11:0x79a04d44dc5bf19d!8m2!3d48.14966!4d11.5678602?hl=en">Google Maps</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920002.jpg" alt=""><br> +<img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/tenth-tuc-meeting/attachments/81920005/81920003.jpg" alt=""></p> + + + + + Ninth TUC Meeting + https://ldbcouncil.org/event/ninth-tuc-meeting/ + Thu, 09 Feb 2017 15:07:18 -0400 + + https://ldbcouncil.org/event/ninth-tuc-meeting/ + <p>LDBC is pleased to announce its Ninth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> in Walldorf, Germany on February 9+10, 2017.</p> +<p>This will be the third TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend.
If you are interested, please contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is related to graph data management. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology user contacts but also eventually its membership base.</p> +<h3 id="agenda">Agenda</h3> +<p>In the TUC meeting there will be:</p> +<ul> +<li>updates on progress with LDBC benchmarks, specifically the Social Network Benchmark (SNB) and its Interactive, Business Intelligence and Graphalytics workloads.</li> +<li>talks by data management practitioners highlighting graph data management challenges</li> +<li>selected scientific talks on graph data management technology</li> +</ul> +<p>The meeting will start on Thursday morning, with a program from 09:00-18:00, interrupted by a lunch break.</p> +<p>Thursday evening (19:00-21:00) there will be a <strong>social dinner</strong> in Heidelberg.</p> +<p>Friday morning the event resumes from 9:00-12:00.
In the afternoon, there is a (closed) LDBC Board of Directors meeting (13:00-16:30) at the same venue.</p> +<h4 id="social-dinner">Social Dinner</h4> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235334.png" alt=""></p> +<p><strong>Address: Hauptstraße 217, 69117 Heidelberg</strong><br> +<strong>Time: 19:00 / 7pm</strong></p> +<p>(See attachments at the bottom of the page)</p> +<h5 id="thursday">Thursday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>Welcome and logistics - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>9:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235329.pdf">Intro + state of the LDBC - Josep Lluis Larriba Pey</a> (UPC)</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235338.pdf">LDBC Graph QL task force</a> - Hannes Voigt (TU Dresden)</td> +</tr> +<tr> +<td>9:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235335.pdf">PGQL Status Update and Comparison to LDBC&rsquo;s Graph QL proposals</a> - Oskar van Rest (Oracle Labs)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628546.pdf">Adding shortest-paths to MonetDB</a> - Dean de Leo (CWI)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431939.pdf">Evolving Cypher for processing multiple graphs</a> - Stefan Plantikow (Neo Technology)</td> +</tr> +<tr> +<td>11:10</td> +<td><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235346.pdf">Standardizing Graph Database Functionality - An Invitation to Collaborate</a> - Jan Michels (ISO/ANSI SQL, Oracle)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75235343.pdf">Dgraph: Graph database for production environment</a> - Tomasz Zdybal (Dgraph.io)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431945.pdf">LDBC Graphalytics: Current Capabilities, Upcoming Features, and Long-Term Roadmap</a> - Alexandru Iosup (TU Delft)</td> +</tr> +<tr> +<td>13:20</td> +<td>LDBC Graphalytics: Demo of the Live Archive and Competition Features - Tim Hegeman (TU Delft)</td> +</tr> +<tr> +<td>13:40</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431942.pdf">LDBC SNB Datagen Update</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431943.pdf">LDBC SNB Business Intelligence Workload: Chokepoint Analysis</a> - Arnau Prat (UPC)</td> +</tr> +<tr> +<td>14:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431947.pdf">LDBC Benchmark Cost Specification</a> (+discussion) - Moritz Kaufmann (TU Munich)</td> +</tr> +<tr> +<td>14:40</td> +<td>coffee break</td> +</tr> +<tr> +<td>15:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76316673.pdf">EYWA: the Distributed Graph Engine in Huawei MIND Platform</a> (Yinglong Xia)</td> +</tr> +<tr> +<td>15:30</td> +<td><a
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75431949.pdf">Graph Processing in SAP HANA</a> - Marcus Paradies (SAP)</td> +</tr> +<tr> +<td>15:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75628563.pdf">Distributed Graph Analytics with Gradoop</a> - Martin Junghanns (Univ Leipzig)</td> +</tr> +<tr> +<td>16:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152834.pdf">Distributed graph flows: Cypher on Flink and Gradoop</a> - Max Kießling (Neo Technology)</td> +</tr> +<tr> +<td>16:30</td> +<td>closing - Peter Boncz</td> +</tr> +<tr> +<td>17:30</td> +<td>end</td> +</tr> +</tbody> +</table> +<h5 id="friday">Friday</h5> +<table> +<thead> +<tr> +<th>start time</th> +<th>title – speaker</th> +</tr> +</thead> +<tbody> +<tr> +<td>9:00</td> +<td>welcome - Peter Boncz</td> +</tr> +<tr> +<td>9:20</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152833.pdf">Graph processing in obi4wan</a> - Frank Smit (OBI4WAN)</td> +</tr> +<tr> +<td>9:40</td> +<td>Graph problems in the space domain - Albrecht Schmidt (ESA)</td> +</tr> +<tr> +<td>10:00</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/75792387.pdf">Medical Ontologies for Healthcare</a> - Michael Neumann (SAP)</td> +</tr> +<tr> +<td>10:20</td> +<td>coffee</td> +</tr> +<tr> +<td>10:50</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76447745.pdf">The Train Benchmark: Cross-Technology Performance Evaluation of Continuous Model Queries</a> - Gabor Szarnyas (BME)</td> +</tr> +<tr> +<td>11:10</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76021761.pdf">Efficient sparse 
matrix computations and their generalization to graph computing applications</a> - Albert-Jan Yzelman (Huawei)</td> +</tr> +<tr> +<td>11:30</td> +<td><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/76152837.pdf">Experiments on Semantic Publishing Benchmark with large scale real news and LOD data at FactForge</a> - Atanas Kyriakov (Ontotext)</td> +</tr> +<tr> +<td>12:00</td> +<td>lunch</td> +</tr> +<tr> +<td>13:00</td> +<td>LDBC Board of Directors Meeting</td> +</tr> +<tr> +<td>17:00</td> +<td>end</td> +</tr> +</tbody> +</table> +<h3 id="logistics">Logistics</h3> +<h5 id="important-things-to-know"><strong>Important things to know</strong></h5> +<p>The following PDF guide provides additional information, such as recommended restaurants as well as sightseeing spots: <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">link</a></p> +<h5 id="venue"><strong>Venue</strong></h5> +<p>The TUC meeting will be held in the <a href="https://websmp201.sap-ag.de/~sapidp/011000358700001204882013E.pdf">SAP Headquarters</a> at the SAP Guesthouse Kalipeh (<a href="https://www.kalipeh.com">https://www.kalipeh.com</a>). The address is:</p> +<p><strong>WDF 44 / SAP Guesthouse Kalipeh<br> +Dietmar-Hopp-Allee 15<br> +69190 Walldorf<br> +Germany</strong></p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p><a href="https://www.google.com/maps/place/SAP+Guesthouse+Kalipeh/@49.2951903,8.6436224,17z/data=!3m1!4b1!4m5!3m4!1s0x4797bea343a566af:0xd70698f3503ab74b!8m2!3d49.2951868!4d8.6458111">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/ninth-tuc-meeting/attachments/59277315/69042180.png" alt=""></p> +<h4 id="getting-there"><strong>Getting there</strong></h4> +<h5 id="by-plane"><strong>By plane</strong></h5> +<p>There are two airports close to SAP&rsquo;s headquarter: Frankfurt Airport (FRA) and Stuttgart-Echterdingen Airport (STR). 
The journey from Frankfurt Airport to SAP headquarters takes about one hour by car, while it takes slightly longer from Stuttgart-Echterdingen Airport. Concerning airfare, flights to Frankfurt are usually somewhat more expensive than to Stuttgart.</p> +<p>When booking flights to Frankfurt, you should be aware of Frankfurt-Hahn Airport (HHN), which serves low-cost carriers but is not connected to Frankfurt Airport. Frankfurt-Hahn is approximately one hour from the Frankfurt main airport by car.</p> +<p>The journey from Frankfurt Airport to SAP headquarters takes about one hour by car (95 kilometers, or 59 miles).</p> +<p>The journey from Stuttgart-Echterdingen Airport to SAP headquarters takes about 1 hour and 15 minutes by car (115 kilometers, or 71 miles).</p> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<p><strong>Traveling from Frankfurt Airport (FRA) to SAP Headquarters:</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A3/Würzburg/A5/Kassel/Basel/Frankfurt.&rdquo;</li> +<li>Follow the A5 to &ldquo;Basel/Karlsruhe/Heidelberg.&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<p>(Should you use a navigational system which does not recognize the street name &lsquo;Dietmar-Hopp-Allee&rsquo; please use &lsquo;Neurottstrasse&rsquo; instead.)</p> +<p><strong>Traveling from Stuttgart-Echterdingen Airport (STR) to SAP Headquarters:</strong></p> +<p>To get to SAP headquarters by car, there are two possible routes to take. The first leads you via Heilbronn and the second via Karlsruhe. 
The route via Karlsruhe is a bit shorter yet may be more congested.</p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>When leaving the airport, follow the highway symbol onto &ldquo;A8/Stuttgart/B27.&rdquo;</li> +<li>Stay on A8 and follow the sign for &ldquo;Karlsruhe/Heilbronn/Singen/A8.&rdquo;</li> +<li>Follow A8 to Karlsruhe.</li> +<li>Take exit 41 &ndash; &ldquo;Dreieck Karlsruhe&rdquo; to merge onto A5 toward &ldquo;Frankfurt/Mannheim/Karlsruhe/Landau (Pfalz).&rdquo;</li> +<li>Take exit 39 &ndash; &ldquo;Walldorf/Wiesloch.&rdquo;</li> +<li>Turn left onto B291.</li> +<li>Turn right onto Dietmar-Hopp-Allee.</li> +</ul> +<h6 id="parking"><strong>Parking</strong></h6> +<p>The closest parking lot to the event location is P7 (see figure above).</p> +<h5 id="by-train"><strong>By Train</strong></h5> +<p>As the infrastructure is very well developed in Europe, and in Germany in particular, taking the train is a great and easy way of traveling. Furthermore, the trains usually run on time, so this mode of travel is very convenient, especially for a group of people on longer journeys to major cities.</p> +<p><strong>From Frankfurt Airport (FRA) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to Terminal 1, level T (see overview in Appendix).</li> +<li>Go to the AIRail Terminal &ndash; &ldquo;Fernbahnhof&rdquo; (long-distance trains).</li> +<li>Choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP.&rdquo; It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> +<p><strong>From Stuttgart-Echterdingen Airport (STR) to SAP Headquarters</strong></p> +<p>Directions to SAP headquarters:</p> +<ul> +<li>Go to the S-Bahn station in the airport, following the sign (station is called &ldquo;Stuttgart Flughafen/Messe&rdquo;).</li> +<li>Take 
train number S2 or S3 to &ldquo;Stuttgart Hauptbahnhof&rdquo; (main station).</li> +<li>From Stuttgart Hauptbahnhof choose a connection with the destination train station &ldquo;Wiesloch&ndash;Walldorf&rdquo;.</li> +<li>From station &ldquo;Wiesloch&ndash;Walldorf,&rdquo; take bus number 707 or 721 toward &ldquo;Industriegebiet Walldorf, SAP&rdquo;. It is a 10-minute ride to reach bus stop &lsquo;SAP headquarters&rsquo;.</li> +</ul> + + + + + LDBC Is Proud to Announce the New LDBC Graphalytics Benchmark Draft Specification + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + Tue, 06 Sep 2016 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-is-proud-to-announce-the-new-ldbc-graphalytics-benchmark-draft-specification/ + <p>LDBC is proud to announce the new LDBC Graphalytics Benchmark draft specification.</p> +<p>LDBC Graphalytics is the first industry-grade graph data management benchmark for graph analysis platforms such as Giraph. It consists of six core algorithms, standard datasets, synthetic dataset generators, and reference outputs, enabling the objective comparison of graph analysis platforms. It has strong industry support from Oracle, Intel, Huawei and IBM, and was tested and optimized on the best industrial and open-source systems.</p> +<p>Tim Hegeman of <a href="https://www.tudelft.nl">TU Delft</a> is today presenting the technical paper describing LDBC Graphalytics at the important <a href="https://www.vldb.org/conference.html">VLDB</a> (Very Large DataBases) conference in New Delhi, where his talk also marks the release by LDBC of Graphalytics as a benchmark draft. 
Practitioners are invited to read the PVLDB paper, download the software and try running it.</p> +<p>LDBC is eager to use any feedback for its future adoption of LDBC Graphalytics.</p> +<p>Learn more: <a href="/ldbc-graphalytics">LDBC Graphalytics</a></p> +<p>GitHub: <a href="https://github.com/tudelft-atlarge/graphalytics">https://github.com/tudelft-atlarge/graphalytics</a></p> + + + + + Eighth TUC Meeting + https://ldbcouncil.org/event/eighth-tuc-meeting/ + Wed, 22 Jun 2016 14:45:20 -0400 + + https://ldbcouncil.org/event/eighth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Eighth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a> facility in Redwood Shores on <strong>Wednesday and Thursday June 22-23, 2016</strong>.</p> +<p>This will be the second TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify Oracle security in advance, registration requests need to be in by <strong>June 12</strong>.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. 
We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUC meetings, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +<li><a href="#accommodation">Accommodation</a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>On Wednesday, lunch is provided for all attendees at 12 pm. The TUC Meeting will start at 1 pm.</p> +<h6 id="wednesday-22th-of-june-2016-room-203"><strong>Wednesday, 22nd of June 2016 (Room 203)</strong></h6> +<p>(full morning: LDBC Board of Directors meeting)</p> +<ul> +<li>12:00 - 13:00 Lunch (provided)</li> +<li>13:00 - 13:30 Hassan Chafi (Oracle) and Josep L. Larriba-Pey (Sparsity) Registration and welcome.</li> +<li>13:30 - 14:00 Peter Boncz (CWI) <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133891.pdf">LDBC introduction and status update</a>.</li> +<li>14:00 - 15:00 Details on the progress of LDBC Task Forces 1 (chair Josep L. 
Larriba-Pey)</li> +<li>14:00 Arnau Prat (DAMA-UPC). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133902.pdf">Social Network Benchmark, Interactive workload</a>.</li> +<li>14:30 Tim Hegeman (TU Delft). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133893.pdf">Social Network Benchmark, Analytics workload</a>.</li> +<li>15:00 - 15:30 Coffee break</li> +<li>15:30 - 17:00 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>15:30 Martin Zand (University of Rochester Clinical and Translational Science Institute). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133897.pdf">Graphing Healthcare Networks: Data, Analytics, and Use Cases.</a></li> +<li>16:00 David Meibusch, Nathan Hawes (Oracle Labs Australia). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133901.pdf">Frappé: Querying and managing evolving code dependency graphs</a>.</li> +<li>16:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133895.pdf">UniProt: challenges of a public SPARQL endpoint.</a></li> +</ul> +</li> +<li>17:00 - 18:30 Graph Technologies (chair Peter Boncz) +<ul> +<li>17:00 Eugene I. Chong (Oracle USA). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133904.pdf">Balancing Act to improve RDF Query Performance in Oracle Database</a>.</li> +<li>17:30 Lijun Chang (University of New South Wales). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133906.pdf">Efficient Subgraph Matching by Postponing Cartesian Products</a>.</li> +<li>18:00 Weining Qian (East China Normal University). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133908.pdf">On Statistical Characteristics of Real-Life Knowledge Graphs</a>.</li> +</ul> +</li> +</ul> +<h6 id="thursday-23th-of-june-2016-room-203"><strong>Thursday, 23rd of June 2016 (Room 203)</strong></h6> +<ul> +<li>08:00 - 09:00 Breakfast (provided)</li> +<li>09:00 - 10:00 Details on the progress of LDBC Task Forces 2 (chair Josep L. Larriba-Pey) +<ul> +<li>09:00 Peter Boncz (CWI). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52133896.pdf">Query Language Task Force status</a></li> +<li>09:45 Marcus Paradies (SAP). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297729.pdf">Social Network Benchmark, Business Intelligence workload</a></li> +</ul> +</li> +<li>10:00 - 12:00 Graph Technologies and Benchmarking (chair Oskar van Rest) +<ul> +<li>10:00 Sergey Edunov (Facebook). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297731.pdf">Generating realistic trillion-edge graphs</a></li> +<li>10:30 George Fletcher (TU Eindhoven). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297733.pdf">An open source framework for schema-driven graph instance and graph query workload generation</a>.</li> +<li>11:00 Yinglong Xia (Huawei Research America). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297735.pdf">An Efficient Big Graph Analytics Platform</a>.</li> +<li>11:30 Zhe Wu (Oracle USA). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297737.pdf">Bridging RDF Graph and Property Graph Data Models</a></li> +</ul> +</li> +<li>12:00 - 13:30 Lunch (provided)</li> +<li>13:30 - 15:30 Graph Technologies (chair Arnau Prat) +<ul> +<li>13:30 Tobias Lindaaker (Neo Technology). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297740.pdf">An open standard for graph queries: the Cypher contribution</a></li> +<li>14:00 Arash Termehchy (Oregon State University). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297742.pdf">Toward Representation Independent Graph Querying &amp; Analytics</a></li> +<li>14:30 Jerven Bolleman (SIB Swiss Institute of Bioinformatics/UniProt consortium). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297745.pdf">In the service of the federation</a></li> +<li>15:00 Nandish Jayaram (Pivotal). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52297747.pdf">Orion: Enabling Suggestions in a Visual Query Builder for Ultra-Heterogeneous Graphs</a>.</li> +</ul> +</li> +<li>15:30 - 16:00 Coffee break</li> +<li>16:00 - 17:15 Applications and use of Graph Technologies (chair Hassan Chafi) +<ul> +<li>16:00 Jans Aasman (Franz Inc.). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428806.pdf">Semantic Data Lake for Healthcare</a></li> +<li>16:15 Kevin Madden (Tom Sawyer Software). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428812.pdf">Dismantling Criminal Networks with Graph and Spatial Visualization and Analysis</a></li> +<li>16:45 Juan Sequeda (Capsenta). 
<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428810.pdf">Using graph representation and semantic technology to virtually integrate and search multiple diverse data sources</a></li> +<li>17:15 Kevin Wilkinson (Hewlett Packard Labs). <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/52428808.pdf">LDBC SNB extensions</a></li> +</ul> +</li> +<li>17:45 - 18:15 Closing discussion</li> +</ul> +<h6 id="friday-24th-of-june-2016-room-105"><strong>Friday, 24th of June 2016 (Room 105)</strong></h6> +<p>At the same venue: the fourth international workshop on Graph Data Management, Experience and Systems (<strong>GRADES16</strong>).</p> +<p>18:30 social dinner for GRADES registrants (place to be announced)</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>22nd and 23rd June 2016</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the <a href="http://www.oracle.com/technetwork/database/rdb/hqcc-dir-134199.pdf">Oracle Conference Center</a></p> +<p>The address is:</p> +<p><strong>Room 203 (Wed-Thu) &amp; Room 105 (Fri)</strong><br> +<strong>Oracle Conference Center</strong><br> +<strong>350 Oracle Parkway</strong><br> +<strong>Redwood City, CA 94065, USA</strong></p> +<p><strong>Maps and situation</strong></p> +<p><a href="https://www.google.com/maps/place/Oracle+Conference+Center/@37.5322827,-122.2667034,17z/data=!3m1!4b1!4m2!3m1!1s0x808f98b5450e8ca3:0xdc75e8b1c02bbb91">Google Maps link</a></p> +<p>Oracle Campus map:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/eighth-tuc-meeting/attachments/40927235/40927234.jpg" alt=""></p> +<h5 id="getting-there"><strong>Getting there</strong></h5> +<h6 id="driving-directions"><strong>Driving directions</strong></h6> +<ul> +<li>[Southbound] <strong>-</strong> Take Highway 101 South (toward San Jose) to the Ralston Ave./Marine 
World Parkway exit. Take Marine World Parkway east which will loop you back over the freeway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +<li>[Northbound] <strong>-</strong> Take Highway 101 North (toward San Francisco) to the Ralston Ave./Marine World Parkway exit. Take the first exit ramp onto Marine World Parkway. Make a left at the first light onto Oracle Parkway. 350 Oracle Parkway will be on the right.</li> +</ul> +<h5 id="parking"><strong>Parking</strong></h5> +<p>The Conference Center has a designated parking lot located directly across from the building. If the lot is full, there is also additional parking in any of the parking garages located nearby. No parking permits are needed.</p> +<h5 id="public-transport"><strong>Public transport</strong></h5> +<p>Take the Caltrain to either San Carlos or Hillsdale and take the free Oracle shuttle from there. Get off the Oracle shuttle at 100 Oracle Parkway (second stop) and walk 5 minutes to get to the Conference Center.</p> +<ul> +<li>Caltrain timetables: <a href="http://www.caltrain.com/schedules/weekdaytimetable.html">http://www.caltrain.com/schedules/weekdaytimetable.html</a></li> +<li>Oracle Shuttle timetables: <a href="http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html">http://www.caltrain.com/schedules/Shuttles/Oracle_Shuttle.html</a></li> +</ul> +<p>You can also take the Caltrain to Belmont and walk 23 min, instead of taking the Oracle shuttle.</p> +<p>Alternatively, SamTrans (San Mateo County&rsquo;s Transit Agency) provides public bus service between the Millbrae BART station and Palo Alto with three stops on Oracle Parkway - one of which is directly in front of the Oracle Conference Center.</p> + + + + + LDBC and Apache Flink + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + Mon, 16 Nov 2015 14:47:00 +0000 + + https://ldbcouncil.org/post/ldbc-and-apache-flink/ + <p>Apache Flink <a href="#references">[1]</a> is an open source platform for 
distributed stream and batch data processing. Flink&rsquo;s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.</p> +<p><img src="https://flink.apache.org/img/flink-stack-small.png" alt=""></p> +<p>Flink offers multiple APIs to process data from various data sources (e.g. HDFS, HBase, Kafka and JDBC). The DataStream and DataSet APIs allow the user to apply general-purpose data operations, like map, reduce, groupBy and join, on streams and static data respectively. In addition, Flink provides libraries for machine learning (Flink ML), graph processing (Gelly) and SQL-like operations (Table). All APIs can be used together in a single Flink program which enables the definition of powerful analytical workflows and the implementation of distributed algorithms.</p> +<p>The following snippet shows how a wordcount program can be expressed in Flink using the DataSet API:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">&gt;</span> text <span style="color:#f92672">=</span> env<span style="color:#f92672">.</span><span style="color:#a6e22e">fromElements</span><span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the past controls the future.&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;He who controls the present controls the past.&#34;</span><span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> 
+</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>Tuple2<span style="color:#f92672">&lt;</span>String<span style="color:#f92672">,</span> Integer<span style="color:#f92672">&gt;&gt;</span> wordCounts <span style="color:#f92672">=</span> text +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">flatMap</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> LineSplitter<span style="color:#f92672">())</span> <span style="color:#75715e">// splits the line and outputs (word,1) +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>tuples<span style="color:#f92672">.</span><span style="color:#a6e22e">groupBy</span><span style="color:#f92672">(</span><span style="color:#ae81ff">0</span><span style="color:#f92672">)</span> <span style="color:#75715e">// group by word +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> <span style="color:#f92672">.</span><span style="color:#a6e22e">sum</span><span style="color:#f92672">(</span><span style="color:#ae81ff">1</span><span style="color:#f92672">);</span> <span style="color:#75715e">// sum the 1&#39;s +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> +</span></span><span style="display:flex;"><span>wordCounts<span style="color:#f92672">.</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>At the Leipzig University, we use Apache Flink as execution layer for our graph analytics platform Gradoop <a href="#references">[2]</a>. The LDBC datagen helps us to evaluate the scalability of our algorithms and operators in a distributed execution environment. 
To use the generated graph data in Flink, we wrote a tool that transforms the LDBC output files into Flink data sets for further processing <a href="#references">[3]</a>. Using the class <code>LDBCToFlink</code>, LDBC output files can be read directly from HDFS or from the local file system:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;hdfs:///ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> <span style="color:#75715e">// or &#34;/path/to/social_network&#34; +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> vertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span>DataSet<span 
style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> edges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The tuple classes <code>LDBCVertex</code> and <code>LDBCEdge</code> hold the information generated by the LDBC datagen and are created directly from its output files. During the transformation process, globally unique vertex identifiers are created based on the LDBC identifier and the vertex class. When reading edge files, source and target vertex identifiers are computed in the same way to ensure consistent linking between vertices.</p> +<p>Each <code>LDBCVertex</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all vertices</li><li>a vertex label (e.g. <code>Person</code>, <code>Comment</code>)</li><li>a key-value map of properties including also multivalued properties<br> +(e.g. <code>Person.email</code>)</li> +</ul> +<p>Each <code>LDBCEdge</code> instance contains:</p> +<ul> +<li>an identifier, which is unique among all edges</li> +<li>an edge label (e.g. <code>knows</code>, <code>likes</code>)</li> +<li>a source vertex identifier</li> +<li>a target vertex identifier</li> +<li>a key-value map of properties</li> +</ul> +<p>The resulting datasets can be used by the DataSet API and all libraries that are built on top of it (i.e. Flink ML, Gelly and Table). In the following example, we load the LDBC graph from HDFS, filter vertices with the label <code>Person</code> and edges with the label <code>knows</code> and use Gelly to compute the connected components of that subgraph. 
The full source code is available on GitHub <a href="#references">[4]</a>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-java" data-lang="java"><span style="display:flex;"><span><span style="color:#66d9ef">final</span> ExecutionEnvironment env <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> ExecutionEnvironment<span style="color:#f92672">.</span><span style="color:#a6e22e">getExecutionEnvironment</span><span style="color:#f92672">();</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">final</span> LDBCToFlink ldbcToFlink <span style="color:#f92672">=</span> <span style="color:#66d9ef">new</span> LDBCToFlink<span style="color:#f92672">(</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;/home/s1ck/Devel/Java/ldbc_snb_datagen/social_network&#34;</span><span style="color:#f92672">,</span> +</span></span><span style="display:flex;"><span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter vertices with label “Person” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCVertex<span style="color:#f92672">&gt;</span> ldbcVertices <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getVertices</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span 
style="color:#f92672">.</span><span style="color:#a6e22e">VERTEX_CLASS_PERSON</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// filter edges with label “knows” +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>LDBCEdge<span style="color:#f92672">&gt;</span> ldbcEdges <span style="color:#f92672">=</span> ldbcToFlink<span style="color:#f92672">.</span><span style="color:#a6e22e">getEdges</span><span style="color:#f92672">()</span> +</span></span><span style="display:flex;"><span> <span style="color:#f92672">.</span><span style="color:#a6e22e">filter</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeLabelFilter<span style="color:#f92672">(</span>LDBCConstants<span style="color:#f92672">.</span><span style="color:#a6e22e">EDGE_CLASS_KNOWS</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly vertices suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> vertices <span style="color:#f92672">=</span> ldbcVertices<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> VertexInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly edges suitable for connected components +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span 
style="color:#f92672">&lt;</span>Edge<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;&gt;</span> edges <span style="color:#f92672">=</span> ldbcEdges<span style="color:#f92672">.</span><span style="color:#a6e22e">map</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> EdgeInitializer<span style="color:#f92672">());</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// create Gelly graph +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>Graph<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;</span> g <span style="color:#f92672">=</span> Graph<span style="color:#f92672">.</span><span style="color:#a6e22e">fromDataSet</span><span style="color:#f92672">(</span>vertices<span style="color:#f92672">,</span> edges<span style="color:#f92672">,</span> env<span style="color:#f92672">);</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// run connected components on the subgraph for 10 iterations +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>DataSet<span style="color:#f92672">&lt;</span>Vertex<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> Long<span style="color:#f92672">&gt;&gt;</span> components <span style="color:#f92672">=</span> +</span></span><span style="display:flex;"><span> g<span style="color:#f92672">.</span><span style="color:#a6e22e">run</span><span style="color:#f92672">(</span><span style="color:#66d9ef">new</span> ConnectedComponents<span style="color:#f92672">&lt;</span>Long<span style="color:#f92672">,</span> NullValue<span style="color:#f92672">&gt;(</span><span 
style="color:#ae81ff">10</span><span style="color:#f92672">));</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">// print the component id of the first 10 vertices +</span></span></span><span style="display:flex;"><span><span style="color:#75715e"></span>components<span style="color:#f92672">.</span><span style="color:#a6e22e">first</span><span style="color:#f92672">(</span><span style="color:#ae81ff">10</span><span style="color:#f92672">).</span><span style="color:#a6e22e">print</span><span style="color:#f92672">();</span> +</span></span></code></pre></div><p>The ldbc-flink-import tool is available on Github <a href="#references">[3]</a> and licensed under the GNU GPLv3. If you have any questions regarding the tool please feel free to contact me on GitHub. If you find bugs or have any ideas for improvements, please create an issue or a pull request.</p> +<p>If you want to learn more about Apache Flink, a good starting point is the main documentation <a href="#references">[5]</a> and if you have any question feel free to ask the official mailing lists.<br> +There is also a nice set of videos <a href="#references">[6]</a> available from the latest Flink Forward conference.</p> +<h4 id="references">References</h4> +<p>[1] <a href="http://flink.apache.org/">http://flink.apache.org/</a></p> +<p>[2] <a href="https://github.com/dbs-leipzig/gradoop">https://github.com/dbs-leipzig/gradoop</a></p> +<p>[3] <a href="https://github.com/s1ck/ldbc-flink-import">https://github.com/s1ck/ldbc-flink-import</a></p> +<p>[4] <a href="https://gist.github.com/s1ck/b33e6a4874c15c35cd16">https://gist.github.com/s1ck/b33e6a4874c15c35cd16</a></p> +<p>[5] <a href="https://ci.apache.org/projects/flink/flink-docs-release-0.10/">https://ci.apache.org/projects/flink/flink-docs-release-0.10/</a></p> +<p>[6] <a 
href="https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA">https://www.youtube.com/channel/UCY8_lgiZLZErZPF47a2hXMA</a></p> + + + + + Seventh TUC Meeting + https://ldbcouncil.org/event/seventh-tuc-meeting/ + Mon, 09 Nov 2015 14:17:30 -0400 + + https://ldbcouncil.org/event/seventh-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its Seventh Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at <a href="http://www.research.ibm.com/labs/watson">IBM&rsquo;s TJ Watson</a> facility on <strong>Monday and Tuesday November 9/10, 2015.</strong></p> +<p>This will be the first TUC meeting after the finalisation of the LDBC FP7 EC funded project. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the LDBC organisation officials.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact Damaris Coll (UPC) at <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>; in order to notify IBM security in advance, registration requests need to be in by Nov 1.</p> +<p>In the agenda, there will be talks given by LDBC members and LDBC activities, but there will also be room for a number of short 20-minute talks by other participants. We are specifically interested in learning about new challenges in graph data management (where benchmarking would become useful) and on hearing about actual user stories and scenarios that could inspire benchmarks. Further, talks that provide feedback on existing benchmark (proposals) are very relevant. But nothing is excluded a priori if it is graph data management related. 
Talk proposals can be forwarded to Damaris as well and will be handled by Peter Boncz and Larri.</p> +<p>Further, we call on you if you or your colleagues would happen to have contacts with companies that deal with graph data management scenarios to also attend and possibly present. LDBC is always looking to expand its circle of participants in TUCs meeting, its graph technology users contacts but also eventually its membership base.</p> +<p>In this page, you&rsquo;ll find information about the following items:</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a><br> +- <a href="#date"><strong>Date</strong></a><br> +- <a href="#venue"><strong>Venue</strong></a><br> +- <a href="#maps-and-situation"><strong>Maps and situation</strong></a><br> +- <a href="#getting-there"><strong>Getting there</strong></a></li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>Monday, 9th of November 2015</strong></p> +<p>8:45 - 9:15 Registration and welcome (Yinglong Xia and Josep L. Larriba Pey)</p> +<p>9:15 - 9:30 LDBC introduction and status update (Josep L. Larriba-Pey)</p> +<p>9:30 - 10:30 Details on the progress of LDBC Task Forces 1 (chair Josep L. Larriba-Pey)</p> +<p>9:30 Arnau Prat (DAMA-UPC). Social Network Benchmark, Interactive workload</p> +<p>10:00 Orri Erling (OpenLink Software). Social Network Benchmark, Business Intelligence workload</p> +<p>10:30-11:00 Coffee break</p> +<p>11:00 - 12:30 Details on the progress of LDBC Task Forces 2 (chair Yinglong Xia)</p> +<p>11:00 Alexandru Iosup (TU Delft). Social Network Benchmark, Analytics workload.</p> +<p>11:30 Claudio Gutierrez (U Chile). Query Language Task Force status.</p> +<p>12:00 Atanas Kiryakov (Ontotext). Semantic Publishing Benchmark status</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 - 16:00 Technologies and benchmarking (chair Hassan Chafi)</p> +<p>14:00 Molham Aref (LogicBlox). Graph Data Management with LogicBlox</p> +<p>14:30 Peter Kogge (Notre Dame). 
BFS as in Graph500 on today&rsquo;s architectures</p> +<p>15:00 Ching-Yung Lin (IBM). Status and Demo of IBM System G</p> +<p>15:30-16:00 Coffee break</p> +<p>16:00 - 17:00 Technologies (chair Irini Fundulaki)</p> +<p>16:00 Kavitha Srinivas (IBM). SQLGraph: An efficient relational based property graph store</p> +<p>16:30 David Ediger (GeorgiaTech). STINGER</p> +<p>17:00 Gary King (Franz Inc.). AllegroGraph&rsquo;s SPARQL implementation with Social Network Analytics abilities using Magic Properties</p> +<p>17:30 Manoj Kumar (IBM). Linear Algebra Formulation for Large Graph Analytics</p> +<p>18:00 Reihaneh Amini (Wright State University) Linked Data in the GeoLink Usecase</p> +<p>19:00 Social dinner</p> +<p><strong>Tuesday 10th November 2015</strong></p> +<p>9:00 - 10:30 Technology, Applications and Benchmarking (chair Alexandru Iosup)</p> +<p>9:00 Philip Rathle (Neo). On openCypher</p> +<p>9:20 Morteza Shahriari (University of Florida). Multi-modal Probabilistic Knowledge Base for Remote Sensing Species Identification</p> +<p>9:50 Peter Kogge (Notre Dame). Challenging problems with Lexis Nexis Risk Solutions</p> +<p>10:10 Arnau Prat (DAMA-UPC). DATAGEN, status and perspectives for synthetic data generation</p> +<p>10:30 - 11:00 Coffee break</p> +<p>11:00 - 12:45 Applications and use of Graph Technologies (chair Atanas Kiryakov)</p> +<p>11:00 Hassan Chafi (Oracle). Status and characteristics of PGQL</p> +<p>11:20 David Guedalia (TAGIIO). Multi-tier distributed mobile applications and how they split their workload,</p> +<p>11:40 Guojing Cong (IBM). Algorithmic technique and architectural support for fast graph analysis</p> +<p>12:00 Josep Lluis Larriba-Pey. 
Conclusions for the TUC meeting and future perspectives</p> +<p>12:30 - 14:00 Lunch break</p> +<p>14:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>9th and 10th November 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held in the IBM Thomas J Watson Research Center.<br> +The address is:</p> +<p><strong>IBM Thomas J Watson Research Center</strong><br> +<strong>1101 Kitchawan Rd,</strong><br> +<strong>Yorktown Heights, NY 10598, USA</strong></p> +<p>If you are using a <em>GPS system</em>, please enter <strong>&ldquo;200 Aqueduct Road, Ossining NY, 10562&rdquo;</strong> for accurate directions to the lab entrance. You may also want to check the routing online.</p> +<p>The meeting will take place in the <em>Auditorium</em> on November 9th, and in Meeting Room <em>20-043</em> on November 10th.</p> +<h6 id="maps-and-situation"><strong>Maps and situation</strong></h6> +<p>We strongly suggest that you <strong>rent a car</strong> for your convenience, since the public transportation system does not cover this area very well. Besides, there is no hotel within walking distance of the IBM T.J. Watson Research Center. Feel free to carpool with other attendees. You may find car rental and hotels through <a href="http://www.orbitz.com">www.orbitz.com</a>, or <a href="http://www.expedia.com">www.expedia.com</a>. Feel free to email <a href="mailto:yxia@us.ibm.com">yxia@us.ibm.com</a> for any questions.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/seventh-tuc-meeting/attachments/6882333/15926330.png" alt=""></p> +<h6 id="getting-there"><strong>Getting there</strong></h6> +<p><strong>Upper and Eastern New England</strong></p> +<p>Route I-84 west to Route I-684, south to Exit 6, west on Route 35 to Route 100, south to Route 134, west 2.5 miles. 
IBM is on the left.</p> +<p><strong>New Haven and Connecticut Shores</strong></p> +<p>Merritt Parkway or New England Thruway (Route I-95) west to Route I-287, west to Exit 3, north on Sprain Brook Parkway, which merges into Taconic State Parkway, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New Jersey</strong></p> +<p>Take New York State Thruway (Route I-87) east across the Tappan Zee Bridge and follow signs to the Saw Mill Parkway north. Proceed north on Saw Mill River Parkway to Taconic State Parkway exit, north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>Upstate New York</strong></p> +<p>Route I-84 east across Newburgh-Beacon Bridge to Exit 16-S. Taconic State Parkway south to Route 134 East exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>New York City (Manhattan)</strong></p> +<p>Henry Hudson Parkway north, which becomes Saw Mill River Parkway, north to Taconic State Parkway exit. North on Taconic State Parkway to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>John F. Kennedy International Airport</strong></p> +<p>North on Van Wyck Expressway to the Whitestone Expressway and continue north across the Bronx-Whitestone Bridge to the Hutchinson River Parkway north to the Cross County Parkway exit and proceed west to the Bronx River Parkway. North on the Bronx River Parkway to the Sprain Brook Parkway, which merges into the Taconic State Parkway. Continue north to Ossining/Route 134 exit. Turn right and proceed east on Route 134 several hundred yards. IBM is on the right.</p> +<p><strong>LaGuardia Airport</strong></p> +<p>East on the Grand Central Parkway, north on the Whitestone Expressway, and continue north across the Bronx-Whitestone Bridge. 
Continue with instructions from John F. Kennedy International Airport, above.</p> +<p><strong>Newark International Airport</strong></p> +<p>North on the New Jersey Turnpike (Route I-95). Stay in local lanes and take Exit 72 for Palisades Interstate Parkway. North on the Palisades Interstate Parkway to the New York State Thruway, Route I-87, and east across the Tappan Zee Bridge. Continue with instructions from New Jersey, above.</p> +<p><strong>Stewart International Airport</strong></p> +<p>Route 207 east to Route I-84, east across Newburgh-Beacon Bridge to Taconic State Parkway, south. Continue with instructions from Upstate New York, above.</p> +<p><strong>Westchester County Airport</strong></p> +<p>Right on Route 120, north. Turn left where Route 120 merges with Route 133. Continue on Route 120. Cross Route 100 and continue straight on Shingle House Road to Pines Bridge Road. Turn right and proceed several hundred yards. IBM is on the left.</p> +<p><strong>Public Transportation</strong></p> +<p>Metropolitan Transportation Authority (MTA) train stations nearest to the Yorktown Heights location are the Croton-Harmon and White Plains stations. Taxi service is available at both locations.</p> + + + + + Elements of Instance Matching Benchmarks: a Short Overview + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + Tue, 16 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/elements-of-instance-matching-benchmarks-a-short-overview/ + <p>The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. 
In the vast number of data sources participating in the Linked Data Cloud, this information is not explicitly stated but is discovered using <strong>instance matching</strong> techniques and tools. Instance matching is also known as <strong>record linkage</strong> <a href="#references">[1]</a>, <strong>duplicate detection</strong> <a href="#references">[2]</a>, <strong>entity resolution</strong> <a href="#references">[3]</a> and <strong>object identification</strong> <a href="#references">[4]</a>.</p> +<p>For instance, a search in Geonames (<a href="http://www.geonames.org/">http://www.geonames.org/</a>) for &ldquo;Athens&rdquo; would return a resource (i.e., URI) accompanied with a map of the area and information about the place; additional information for the city of Athens can be found in other datasets such as for instance DBpedia (<a href="http://dbpedia.org/">http://dbpedia.org/</a>) or Open Government Datasets (<a href="http://data.gov.gr/">http://data.gov.gr/</a>). To obtain and exploit all necessary information about the city of Athens we need to establish that the retrieved resources refer to the same real world object.</p> +<p>Web resources are published by &ldquo;autonomous agents&rdquo; who choose their preferred information representation or the one that best fits the application of interest. Furthermore, different representations of the same real world entity are due to data acquisition errors or different acquisition techniques used to process scientific data. Moreover, real world entities evolve and change over time, and sources need to keep track of these developments, a task that is very hard and often not possible. Finally, when integrating data from multiple sources, the process itself may add new erroneous data. 
Clearly, these reasons are not limited to problems that did arise in the era of Web Data; it is thus not surprising that instance matching systems have been around for several years <a href="#references">[2]</a><a href="#references">[5]</a>.</p> +<p>It is, though, essential at this point to develop, along with instance and entity matching systems, <em>instance matching benchmarks to determine the weak and strong points of those systems, as well as their overall quality in order to support users in deciding the system to use for their needs</em>. Hence, well defined, and good quality benchmarks are important for comparing the performance of the available or under development instance matching systems. Benchmarks are used not only to inform users of the strengths and weaknesses of systems, but also to motivate developers, researchers and technology vendors to deal with the weak points of their systems and to ameliorate their performance and functionality. They are also useful for identifying the settings in which each of the systems has optimal performance. Benchmarking aims at providing an objective basis for such assessments.</p> +<p>An instance matching benchmark for Linked Data consists of a <em>source</em> and <em>target dataset</em> implementing a set of <em>test-cases</em>, where each test case addresses a different kind of requirement regarding instance matching, a <em>ground truth</em> or <em>gold standard</em> and finally the <em>evaluation metrics</em> used to <em>assess the benchmark.</em></p> +<p>Datasets are the raw material of a benchmark. A benchmark comprises a <em>source</em> and <em>target</em> dataset and the objective of an instance matching system is to discover the matches of the two. 
Datasets are characterized by (a) their <em>nature</em> (<em>real</em> or <em>synthetic</em>), (b) the <em>schemas/ontologies</em> they use, (c) their <em>domains</em>, (d) the <em>languages</em> they are written in, and (e) the <em>variations/heterogeneities</em> of the datasets. Real datasets are widely used in benchmarks since they offer realistic conditions for heterogeneity problems and they have realistic distributions. <em>Synthetic datasets</em> are generated using automated data generators and are useful because they offer fully controlled test conditions, have accurate gold standards and allow setting the focus on specific types of heterogeneity problems in a systematic manner.</p> +<p>Datasets (and benchmarks) may contain different <em>kinds of variations</em> that correspond to <em>different test cases</em>. According to Ferrara et al. <a href="#references">[6]</a><a href="#references">[7]</a>, three kinds of variations exist for Linked Data, namely <em>data variations</em>, <em>structural variations</em> and <em>logical variations</em>. The first refers mainly to differences due to typographical errors, differences in the employed data formats, language etc. The second refers to the differences in the structure of the employed Linked Data schemas. Finally, the third type derives from the use of semantically rich RDF and OWL constructs that enable one to define hierarchies and equivalence of classes and properties, (in)equality of instances, complex class definitions through union and intersection among others.</p> +<p>The common case in real benchmarks is that the datasets to be matched contain different kinds (combinations) of variations. 
On the other hand, synthetic datasets may be purposefully designed to contain specific types (or combinations) of variations (e.g., only structural), or may be more general in an effort to illustrate all the common cases of discrepancies that appear in reality between individual descriptions.</p> +<p>The <em>gold standard</em> is considered as the “correct answer sheet” of the benchmark, and is used to judge the completeness and soundness of the result sets of the benchmarked systems. For instance matching benchmarks employing synthetic datasets, the gold standard is always automatically generated, as the errors (variations) that are added into the datasets are known and systematically created. When it comes to real datasets, the gold standard can be either manually curated or (semi-) automatically generated. In the first case, domain experts manually mark the matches between the datasets, whereas in the second, supervised and crowdsourcing techniques aid the process of finding the matches, a process that is often time consuming and error prone.</p> +<p>Last, an instance matching benchmark uses <em>evaluation metrics</em> to determine and assess the systems’ output quality and performance. For instance matching tools, performance is not a critical aspect. On the other hand, an instance matching tool should return all and only the correct answers. So, what matters most is returning the relevant matches, rather than returning them quickly. For this reason, the evaluation metrics that are dominantly employed for instance matching benchmarks are the standard <em>precision</em>, <em>recall</em> and <em>f-measure</em> metrics.</p> +<h4 id="references">References</h4> +<p>[1] Li, C., Jin, L., and Mehrotra, S. (2006) Supporting efficient record linkage for large data sets using mapping techniques. 
WWW 2006.</p> +<p>[2] Dragisic, Z., Eckert, K., Euzenat, J., Faria, D., Ferrara, A., Granada, R., Ivanova, V., Jimenez-Ruiz, E., Oskar Kempf, A., Lambrix, P., Montanelli, S., Paulheim, H., Ritze, D., Shvaiko, P., Solimando, A., Trojahn, C., Zamaza, O., and Cuenca Grau, B. (2014) Results of the Ontology Alignment Evaluation Initiative 2014. Proc. 9th ISWC workshop on ontology matching (OM 2014).</p> +<p>[3] Bhattacharya, I. and Getoor, L. (2006) Entity resolution in graphs. Mining Graph Data. Wiley and Sons 2006.</p> +<p>[4] Noessner, J., Niepert, M., Meilicke, C., and Stuckenschmidt, H. (2010) Leveraging Terminological Structure for Object Reconciliation. In ESWC 2010.</p> +<p>[5] Flouris, G., Manakanatas, D., Kondylakis, H., Plexousakis, D., Antoniou, G. Ontology Change: Classification and Survey (2008) Knowledge Engineering Review (KER 2008), pages 117-152.</p> +<p>[6] Ferrara, A., Lorusso, D., Montanelli, S., and Varese, G. (2008) Towards a Benchmark for Instance Matching. Proc. 3rd ISWC workshop on ontology matching (OM 2008).</p> +<p>[7] Ferrara, A., Montanelli, S., Noessner, J., and Stuckenschmidt, H. (2011) Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + SNB Interactive Part 3: Choke Points and Initial Run on Virtuoso + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + Wed, 10 Jun 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/ + <p>In this post we will look at running the <a href="https://ldbcouncil.org/developer/snb">LDBC SNB</a> on <a href="https://virtuoso.openlinksw.com/">Virtuoso</a>.</p> +<p>First, let&rsquo;s recap what the benchmark is about:</p> +<ol> +<li> +<p>fairly frequent short updates, with no update contention worth mentioning</p> +</li> +<li> +<p>short random lookups</p> +</li> +<li> +<p>medium complex queries centered around a person&rsquo;s social environment</p> +</li> +</ol> +<p>The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an online social application does lookups for the most part. The medium complex queries are to challenge the DBMS.</p> +<p>The DBMS challenges have to do firstly with query optimization, and secondly with execution with a lot of non-local random access patterns. Query optimization is not a requirement, <em>per se,</em> since imperative implementations are allowed, but we will see that these are no more free of the laws of nature than the declarative ones.</p> +<p>The workload is arbitrarily parallel, so intra-query parallelization is not particularly useful, if also not harmful. There are latency constraints on operations which strongly encourage implementations to stay within a predictable time envelope regardless of specific query parameters. The parameters are a combination of person and date range, and sometimes tags or countries. 
The hardest queries have the potential to access all content created by people within 2 steps of a central person, so possibly thousands of people, times 2000 posts per person, times up to 4 tags per post. We are talking in the millions of key lookups, aiming for sub-second single-threaded execution.</p> +<p>The test system is the same as used in the <a href="http://www.openlinksw.com/weblog/oerling/?id=1739">TPC-H series</a>: dual Xeon E5-2630, 2x6 cores x 2 threads, 2.3GHz, 192 GB RAM. The software is the <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics branch</a> of <a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack, available from www.github.com</a>.</p> +<p>The dataset is the SNB 300G set, with:</p> +<table> +<thead> +<tr> +<th>1,136,127</th> +<th>persons</th> +</tr> +</thead> +<tbody> +<tr> +<td>125,249,604</td> +<td>knows edges</td> +</tr> +<tr> +<td>847,886,644</td> +<td>posts, including replies</td> +</tr> +<tr> +<td>1,145,893,841</td> +<td>tags of posts or replies</td> +</tr> +<tr> +<td>1,140,226,235</td> +<td>likes of posts or replies</td> +</tr> +</tbody> +</table> +<p>As an initial step, we run the benchmark as fast as it will go. We use 32 threads on the driver side for 24 hardware threads.</p> +<p>Below are the numerical quantities for a 400K operation run after 150K operations worth of warmup.</p> +<p><strong>Duration:</strong> 10:41.251<br> +<strong>Throughput:</strong> 623.71 (op/s)</p> +<p>The statistics that matter are detailed below, with operations ranked in order of descending client-side wait-time. 
All times are in milliseconds.</p> +<table> +<thead> +<tr> +<th>% of total</th> +<th>total_wait</th> +<th>name</th> +<th>count</th> +<th>mean</th> +<th>min</th> +<th>max</th> +</tr> +</thead> +<tbody> +<tr> +<td>20%</td> +<td>4,231,130</td> +<td>LdbcQuery5</td> +<td>656</td> +<td>6,449.89</td> +<td>245</td> +<td>10,311</td> +</tr> +<tr> +<td>11%</td> +<td>2,272,954</td> +<td>LdbcQuery8</td> +<td>18,354</td> +<td>123.84</td> +<td>14</td> +<td>2,240</td> +</tr> +<tr> +<td>10%</td> +<td>2,200,718</td> +<td>LdbcQuery3</td> +<td>388</td> +<td>5,671.95</td> +<td>468</td> +<td>17,368</td> +</tr> +<tr> +<td>7.3%</td> +<td>1,561,382</td> +<td>LdbcQuery14</td> +<td>1,124</td> +<td>1,389.13</td> +<td>4</td> +<td>5,724</td> +</tr> +<tr> +<td>6.7%</td> +<td>1,441,575</td> +<td>LdbcQuery12</td> +<td>1,252</td> +<td>1,151.42</td> +<td>15</td> +<td>3,273</td> +</tr> +<tr> +<td>6.5%</td> +<td>1,396,932</td> +<td>LdbcQuery10</td> +<td>1,252</td> +<td>1,115.76</td> +<td>13</td> +<td>4,743</td> +</tr> +<tr> +<td>5%</td> +<td>1,064,457</td> +<td>LdbcShortQuery3PersonFriends</td> +<td>46,285</td> +<td>22.9979</td> +<td>0</td> +<td>2,287</td> +</tr> +<tr> +<td>4.9%</td> +<td>1,047,536</td> +<td>LdbcShortQuery2PersonPosts</td> +<td>46,285</td> +<td>22.6323</td> +<td>0</td> +<td>2,156</td> +</tr> +<tr> +<td>4.1%</td> +<td>885,102</td> +<td>LdbcQuery6</td> +<td>1,721</td> +<td>514.295</td> +<td>8</td> +<td>5,227</td> +</tr> +<tr> +<td>3.3%</td> +<td>707,901</td> +<td>LdbcQuery1</td> +<td>2,117</td> +<td>334.389</td> +<td>28</td> +<td>3,467</td> +</tr> +<tr> +<td>2.4%</td> +<td>521,738</td> +<td>LdbcQuery4</td> +<td>1,530</td> +<td>341.005</td> +<td>49</td> +<td>2,774</td> +</tr> +<tr> +<td>2.1%</td> +<td>440,197</td> +<td>LdbcShortQuery4MessageContent</td> +<td>46,302</td> +<td>9.50708</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.9%</td> +<td>407,450</td> +<td>LdbcUpdate5AddForumMembership</td> +<td>14,338</td> +<td>28.4175</td> +<td>0</td> +<td>2,008</td> +</tr> +<tr> 
+<td>1.9%</td> +<td>405,243</td> +<td>LdbcShortQuery7MessageReplies</td> +<td>46,302</td> +<td>8.75217</td> +<td>0</td> +<td>2,112</td> +</tr> +<tr> +<td>1.9%</td> +<td>404,002</td> +<td>LdbcShortQuery6MessageForum</td> +<td>46,302</td> +<td>8.72537</td> +<td>0</td> +<td>1,968</td> +</tr> +<tr> +<td>1.8%</td> +<td>387,044</td> +<td>LdbcUpdate3AddCommentLike</td> +<td>12,659</td> +<td>30.5746</td> +<td>0</td> +<td>2,060</td> +</tr> +<tr> +<td>1.7%</td> +<td>361,290</td> +<td>LdbcShortQuery1PersonProfile</td> +<td>46,285</td> +<td>7.80577</td> +<td>0</td> +<td>2,015</td> +</tr> +<tr> +<td>1.6%</td> +<td>334,409</td> +<td>LdbcShortQuery5MessageCreator</td> +<td>46,302</td> +<td>7.22234</td> +<td>0</td> +<td>2,055</td> +</tr> +<tr> +<td>1%</td> +<td>220,740</td> +<td>LdbcQuery2</td> +<td>1,488</td> +<td>148.347</td> +<td>2</td> +<td>2,504</td> +</tr> +<tr> +<td>0.96%</td> +<td>205,910</td> +<td>LdbcQuery7</td> +<td>1,721</td> +<td>119.646</td> +<td>11</td> +<td>2,295</td> +</tr> +<tr> +<td>0.93%</td> +<td>198,971</td> +<td>LdbcUpdate2AddPostLike</td> +<td>5,974</td> +<td>33.3062</td> +<td>0</td> +<td>1,987</td> +</tr> +<tr> +<td>0.88%</td> +<td>189,871</td> +<td>LdbcQuery11</td> +<td>2,294</td> +<td>82.7685</td> +<td>4</td> +<td>2,219</td> +</tr> +<tr> +<td>0.85%</td> +<td>182,964</td> +<td>LdbcQuery13</td> +<td>2,898</td> +<td>63.1346</td> +<td>1</td> +<td>2,201</td> +</tr> +<tr> +<td>0.74%</td> +<td>158,188</td> +<td>LdbcQuery9</td> +<td>78</td> +<td>2,028.05</td> +<td>1,108</td> +<td>4,183</td> +</tr> +<tr> +<td>0.67%</td> +<td>143,457</td> +<td>LdbcUpdate7AddComment</td> +<td>3,986</td> +<td>35.9902</td> +<td>1</td> +<td>1,912</td> +</tr> +<tr> +<td>0.26%</td> +<td>54,947</td> +<td>LdbcUpdate8AddFriendship</td> +<td>571</td> +<td>96.2294</td> +<td>1</td> +<td>988</td> +</tr> +<tr> +<td>0.2%</td> +<td>43,451</td> +<td>LdbcUpdate6AddPost</td> +<td>1,386</td> +<td>31.3499</td> +<td>1</td> +<td>2,060</td> +</tr> +<tr> +<td>0.01%</td> +<td>1,848</td> 
+<td>LdbcUpdate4AddForum</td> +<td>103</td> +<td>17.9417</td> +<td>1</td> +<td>65</td> +</tr> +<tr> +<td>0.00%</td> +<td>44</td> +<td>LdbcUpdate1AddPerson</td> +<td>2</td> +<td>22</td> +<td>10</td> +<td>34</td> +</tr> +</tbody> +</table> +<p>At this point we have in-depth knowledge of the choke points the benchmark stresses, and we can give a first assessment of whether the design meets its objectives for setting an agenda for the coming years of graph database development.</p> +<p>The implementation is well optimized in general but still has maybe 30% room for improvement. We note that this is based on a compressed column store. One could think that alternative data representations, like in-memory graphs of structs and pointers between them, are better for the task. This is not necessarily so; at the least, a compressed column store is much more space efficient. Space efficiency is the root of cost efficiency, since as soon as the working set is not in memory, a random access workload is badly hit.</p> +<p>The set of choke points (technical challenges) actually revealed by the benchmark is so far as follows:</p> +<ul> +<li> +<p><em>Cardinality estimation under heavy data skew —</em> Many queries take a tag or a country as a parameter. The cardinalities associated with tags vary from 29M posts for the most common to 1 for the least common. Q6 has a common tag (in top few hundred) half the time and a random, most often very infrequent, one the rest of the time. A declarative implementation must recognize the cardinality implications from the literal and plan accordingly. An imperative one would have to count. Missing this makes Q6 take about 40% of the time instead of 4.1% when adapting.</p> +</li> +<li> +<p><em>Covering indices —</em> Being able to make multi-column indices that duplicate some columns from the table often saves an entire table lookup. 
For example, an index on post by author can also contain the post&rsquo;s creation date.</p> +</li> +<li> +<p><em>Multi-hop graph traversal —</em> Most queries access a two-hop environment starting at a person. Two queries look for shortest paths of unbounded length. For the two-hop case, it makes almost no difference whether this is done as a union or a special graph traversal operator. For shortest paths, this simply must be built into the engine; doing this client-side incurs prohibitive overheads. A bidirectional shortest path operation is a requirement for the benchmark.</p> +</li> +<li> +<p><em>Top <em>K</em> —</em> Most queries returning posts order results by descending date. Once there are at least <em>k</em> results, anything older than the <em>k</em>th can be dropped, adding a date selection as early as possible in the query. This interacts with vectored execution, so that starting with a short vector size more rapidly produces an initial top <em>k</em>.</p> +</li> +<li> +<p><em>Late projection —</em> Many queries access several columns and touch millions of rows but only return a few. The columns that are not used in sorting or selection can be retrieved only for the rows that are actually returned. This is especially useful with a column store, as this removes many large columns (e.g., text of a post) from the working set.</p> +</li> +<li> +<p><em>Materialization —</em> Q14 accesses an expensive-to-compute edge weight, the number of post-reply pairs between two people. Keeping this precomputed drops Q14 from the top place. Other materialization would be possible, for example Q2 (top 20 posts by friends), but since Q2 is just 1% of the load, there is no need. One could of course argue that this should be 20x more frequent, in which case there could be a point to this.</p> +</li> +<li> +<p><em>Concurrency control —</em> Read-write contention is rare, as updates are randomly spread over the database. 
However, some pages get read very frequently, e.g., some middle level index pages in the post table. Keeping a count of reading threads requires a mutex, and there is significant contention on this. Since the hot set can be one page, adding more mutexes does not always help. However, hash partitioning the index into many independent trees (as in the case of a cluster) helps for this. There is also contention on a mutex for assigning threads to client requests, as there are large numbers of short operations.</p> +</li> +</ul> +<p>In subsequent posts, we will look at specific queries, what they in fact do, and what their theoretical performance limits would be. In this way we will have a precise understanding of which way SNB can steer the graph DB community.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + SNB and Graphs Related Presentations at GRADES '15 + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + Fri, 29 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-and-graphs-related-presentations-at-grades-15/ + <p>Next 31st of May the GRADES workshop will take place in Melbourne within the ACM/SIGMOD presentation. 
GRADES started as an initiative of the Linked Data Benchmark Council in the SIGMOD/PODS 2013 held in New York.</p> +<p>Among the papers published in this edition we have &ldquo;Graphalytics: A Big Data Benchmark for Graph-Processing Platforms&rdquo;, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can be found in <a href="https://github.com/ldbc">https://github.com/ldbc</a>) as the base to execute the algorithms used for the benchmark, among which we have BFS, community detection and connected components. We also have &ldquo;Microblogging Queries on Graph Databases: an Introspection&rdquo; which benchmarks two of the most significant Graph Databases in the market, i.e. Neo4j and Sparksee using microblogging queries on top of twitter data. We can finally mention &ldquo;Frappé: Querying the Linux Kernel Dependency Graph&rdquo; which presents a framework for querying and visualising the dependencies of large C/C++ software systems.</p> +<p><a href="http://event.cwi.nl/grades2015/program.shtml">Check the complete agenda.</a></p> +<p>Meet you in Melbourne!</p> + + + + + SNB Interactive Part 2: Modeling Choices + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + Tue, 26 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices/ + <p><a href="https://ldbcouncil.org/benchmarks/snb">​SNB Interactive</a> is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.</p> +<p>In the case of <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server">Virtuoso</a>, we have played with <a href="http://dbpedia.org/resource/SQL">SQL</a> and <a href="http://dbpedia.org/resource/SPARQL">SPARQL</a> implementations. For a fixed schema and well known workload, SQL will always win. 
The reason for this is that this allows us to materialize multi-part indices and data orderings that make sense for the application. In other words, there is transparency into physical design. An RDF application may also have physical design by means of structure-aware storage but this is more complex and here we are just concerned with speed and having things work precisely as we intend.</p> +<h3 id="schema-design">Schema Design</h3> +<p>SNB has a regular schema described by a <a href="https://en.wikipedia.org/wiki/Unified_Modeling_Language">UML</a> diagram. This has a number of relationships of which some have attributes. There are no heterogeneous sets, e.g. no need for run-time typed attributes or graph edges with the same label but heterogeneous end points. Translation into SQL or RDF is straightforward. Edges with attributes, e.g. the knows relation between people would end up represented as a subject with the end points and the date since as properties. The relational implementation has a two-part primary key and the date since as a dependent column. A native property graph database would use an edge with an extra property for this, as such are typically supported.</p> +<p>The only table-level choice has to do with whether <code>posts</code> and <code>comments</code> are kept in the same or different data structures. The Virtuoso schema has a single table for both, with nullable columns for the properties that occur only in one. This makes the queries more concise. There are cases where only non-reply posts of a given author are accessed. This is supported by having two author foreign key columns each with its own index. There is a single nullable foreign key from the reply to the post/comment being replied to.</p> +<p>The workload has some frequent access paths that need to be supported by index. Some queries reward placing extra columns in indices. For example, a common pattern is accessing the most recent posts of an author or group of authors. 
There, having a composite key of <code>ps_creatorid</code>, <code>ps_creationdate</code>, <code>ps_postid</code> pays off since the top-k on <code>creationdate</code> can be pushed down into the index without needing a reference to the table.</p> +<p>The implementation is free to choose data types for attributes, specifically datetimes. The Virtuoso implementation adopts the practice of the <a href="http://dbpedia.org/resource/DEX_(Graph_database)">Sparksee</a> and <a href="http://dbpedia.org/resource/Neo4j">Neo4J</a> implementations and represents this as a count of milliseconds since epoch. This is less confusing, faster to compare and more compact than a native datetime datatype that may or may not have timezones etc. Using a built-in datetime seems to be nearly always a bad idea. A dimension table or a number for a time dimension avoids the ambiguities of a calendar or at least makes these explicit.</p> +<p>The benchmark allows procedurally maintaining materializations of intermediate results for use by queries as long as these are maintained transaction by transaction. For example, each person could have the 20 newest posts by immediate contacts precomputed. This would reduce Q2 &ldquo;top of the wall&rdquo; to a single lookup. This does not however appear to be worthwhile. The Virtuoso implementation does do one such materialization for Q14: A connection weight is calculated for every pair of persons that know each other. This is related to the count of replies by one or the other to content generated by the other. If there does not exist a single reply in either direction, the weight is taken to be 0. This weight is precomputed after bulk load and subsequently maintained each time a reply is added. The table for this is the only row-wise structure in the schema and represents a half matrix of connected people, i.e. <code>person1</code>, <code>person2</code> -&gt; <code>weight</code>. 
<code>Person1</code> is by convention the one with the smaller <code>p_personid</code>. Note that comparing id&rsquo;s in this way is useful but not normally supported by RDF systems. RDF would end up comparing strings of URI&rsquo;s with disastrous performance implications unless an implementation specific trick were used.</p> +<p>In the next installment we will analyze an actual run.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + Mon, 25 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/ldbc-participates-in-the-36th-edition-of-the-acm-sigmod-pods-conference/ + <p>LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. 
The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.</p> +<p>On the industry track, LDBC will be presenting the <em>Social Network Benchmark Interactive Workload</em> by Orri Erling (OpenLink Software), Alex Averbuch (Neo Technology), Josep Larriba-Pey (Sparsity Technologies), Hassan Chafi (Oracle Labs), Andrey Gubichev (TU Munich), Arnau Prat (Universitat Politècnica de Catalunya), Minh-Duc Pham (VU University Amsterdam) and Peter Boncz (CWI).</p> +<p>You can read more about the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark here</a> and collaborate if you&rsquo;re interested!</p> +<p>The other presentation will be at the GRADES workshop within the SIGMOD program regarding <em>Graphalytics: A Big Data Benchmark for Graph-Processing platforms</em> by Mihai Capotă, Tim Hegeman, Alexandru Iosup (Delft University of Technology), Arnau Prat (Universitat Politècnica de Catalunya), Orri Erling (OpenLink Software) and Peter Boncz (CWI). We will provide more information about GRADES and this specific presentation in a following post as GRADES is part of the events organized by LDBC.</p> +<p>Don&rsquo;t forget to check our presentations if you&rsquo;re attending the SIGMOD!</p> + + + + + SNB Interactive Part 1: What Is SNB Interactive Really About? + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + Thu, 14 May 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about/ + <p>This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. 
This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.</p> +<p>With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its performance characteristics are understood and even if these do not represent the maximum of the attainable, there are no glaring mistakes and the implementation represents a reasonable best effort by those who ought to know, namely the system vendors.</p> +<p>The essence of a benchmark is a set of trick questions or choke points, as LDBC calls them. A number of these were planned from the start. It is then the role of experience to tell whether addressing these is really the key to winning the race. Unforeseen ones will also surface.</p> +<p>So far, we see that SNB confronts the implementor with choices in the following areas:</p> +<ul> +<li>Data model: Relational, RDF, property graph?</li> +<li>Physical model, e.g. row-wise vs. column wise storage</li> +<li>Materialized data ordering: Sorted projections, composite keys, replicating columns in auxiliary data structures</li> +<li>Maintaining precomputed, materialized intermediate results, e.g. use of materialized views, triggers</li> +<li>Query optimization: join order/type, interesting physical data orderings, late projection, top k, etc.</li> +<li>Parameters vs. literals: Sometimes different parameter values result in different optimal query plans</li> +<li>Predictable, uniform latency: The measurement rules stipulate the SUT must not fall behind the simulated workload</li> +<li>Durability - how to make data durable while maintaining steady throughput? Logging vs. checkpointing.</li> +</ul> +<p>In the process of making a benchmark implementation, one naturally encounters questions about the validity, reasonability and rationale of the benchmark definition itself. 
Additionally, even though the benchmark might not directly measure certain aspects of a system, making an implementation will take a system past its usual envelope and highlight some operational aspects.</p> +<ul> +<li>Data generation - Generating a mid-size dataset takes time, e.g. 8 hours for 300G. In a cloud situation, keeping the dataset in S3 or similar is necessary, re-generating every time is not an option.</li> +<li>Query mix - Are the relative frequencies of the operations reasonable? What bias does this introduce?</li> +<li>Uniformity of parameters: Due to non-uniform data distributions in the dataset, there is easily a 100x difference between a &lsquo;fast&rsquo; and &lsquo;slow&rsquo; case of a single query template. How long does one need to run to balance these fluctuations?</li> +<li>Working set: Experience shows that there is a large difference between almost warm and steady state of working set. This can be a factor of 1.5 in throughput.</li> +<li>Are the latency constraints reasonable? In the present case, a qualifying run must have under 5% of all query executions starting over 1 second late. Each execution is scheduled beforehand and done at the intended time. If the SUT does not keep up, it will have all available threads busy and must finish some work before accepting new work, so some queries will start late. Is this a good criterion for measuring consistency of response time? There are some obvious possibilities of abuse.</li> +<li>Is the benchmark easy to implement/run? Perfection is open-ended and optimization possibilities infinite, albeit with diminishing returns. Still, getting started should not be too hard. Since systems will be highly diverse, testing that these in fact do the same thing is important. 
The SNB validation suite is good for this and given publicly available reference implementations, the effort of getting started is not unreasonable.</li> +<li>Since a Qualifying run must meet latency constraints while going as fast as possible, setting the performance target involves trial and error. Does the tooling make this easy?</li> +<li>Is the durability rule reasonable? Right now, one is not required to do checkpoints but must report the time to roll forward from the last checkpoint or initial state. Incenting vendors to build faster recovery is certainly good, but we are not through with all the implications. What about redundant clusters?</li> +</ul> +<p>The following posts will look at the above in light of actual experience.</p> +<h3 id="snb-interactive-series">SNB Interactive Series</h3> +<ul> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-1-what-is-snb-interactive-really-about">SNB Interactive, Part 1: What is SNB Interactive Really About?</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-2-modeling-choices">SNB Interactive, Part 2: Modeling Choices</a></li> +<li><a href="https://ldbcouncil.org/post/snb-interactive-part-3-choke-points-and-initial-run-on-virtuoso/">SNB Interactive, Part 3: Choke Points and Initial Run on Virtuoso</a></li> +</ul> + + + + + Why Do We Need an LDBC SNB-Specific Workload Driver? + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + Tue, 21 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/why-do-we-need-an-ldbc-snb-specific-workload-driver/ + <p>In a previous <a href="https://ldbcouncil.org/tags/driver">3-part blog series</a> we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. 
What we didn&rsquo;t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more traditional database benchmark workloads. Additionally, it will motivate why we chose to develop a new workload driver as part of this work, rather than using existing tooling that was developed in other database benchmarking efforts. To briefly recap, the task of the driver is to run a transactional database benchmark against large synthetic graph datasets - &ldquo;graph&rdquo; is the word that best captures the novelty and difficulty of this work.</p> +<p><strong>Workload Execution - Traditional vs Graph</strong></p> +<p>Transactional graph workloads differ from traditional relational workloads in several fundamental ways, one of them being the complex dependencies that exist between queries of a graph workload.</p> +<p>To understand what is meant by &ldquo;traditional relational workloads&rdquo;, take the classical TPC-C benchmark as an example. In TPC-C Remote Terminal Emulators (emulators) are used to issue update transactions in parallel, where the transactions issued by these emulators do not depend on one another. Note, &ldquo;dependency&rdquo; is used here in the context of scheduling, i.e., one query is dependent on another if it can not start until the other completes. For example, a New-Order transaction does not depend on other orders from this or other users. Naturally, the results of Stock-Level transactions depend on the items that were previously sold, but in TPC-C it is not an emulator&rsquo;s responsibility to enforce any such ordering. The scheduling strategy employed by TPC-C is tailored to the scenario where transactional updates do not depend on one another. 
In reality, one would expect to also have scheduling dependencies between transactions, e.g., checking the status of the order should only be done after the order is registered in the system. TPC-C, however, does not do this and instead only asks for the status of the last order <em>for a given user</em>. Furthermore, adding such dependencies to TPC-C would make scheduling only slightly more elaborate. Indeed, the Load Tester (LT) would need to make sure a New-Order transaction always precedes the read requests that check its status, but because users (and their orders) are partitioned across LTs, and orders belong to a particular user, this scheduling does not require inter-LT communication.</p> +<p>A significantly more difficult scheduling problem arises when we consider the SNB benchmark that models a real-world social network. Its domain includes users that form a social friendship graph and which leave posts/comments/likes on each other&rsquo;s walls (forums). The update transactions are generated (exported as a log) by the data generator, with assigned timestamps, e.g. user 123 added post 456 to forum 789 at time T. Suppose we partition this workload by user, such that each driver gets all the updates (friendship requests, posts, comments and likes on other users&rsquo; posts etc) initiated by a given user. Now, if the benchmark is to resemble a real-world social network, the update operations represent a highly connected (and dependent) network: a user should not create comments before she joins the network, a friendship request can not be sent to a non-existent user, a comment can only be added to a post that already exists, etc. 
Given a user partitioning scheme, most such dependencies would cross the boundaries between driver threads/processes, because the correct execution of update operations requires that the social network is in a particular state, and that state depends on the progress of other threads/processes.</p> +<p>Such scheduling dependencies in the SNB workload essentially replicate the underlying graph-like shape of its dataset. That is, every time a user comments on a friend&rsquo;s wall, for example, there is a dependency between two operations that is captured by an edge of the social graph. <em>Partitioning the workload among the LTs therefore becomes equivalent to graph partitioning, a known hard problem.</em></p> +<p><strong>Because it&rsquo;s a graph</strong></p> +<p>In short, unlike previous database benchmarking efforts, the SNB workload has necessitated a redefining of the state-of-the-art in workload execution. It is no longer sufficient to rely solely on workload partitioning to safely capture inter-query dependencies in complex database benchmark workloads. The graph-centric nature of SNB introduces new challenges, and novel mechanisms had to be developed to overcome these challenges. To the best of our knowledge, the LDBC SNB Interactive benchmark is the first benchmark that requires a non-trivial partitioning of the workload, among the benchmark drivers. 
In the context of workload execution, our contribution is therefore the principled design of a driver that executes dependent update operations in a performant and scalable way, across parallel/distributed LTs, while providing repeatable, vendor-independent execution of the benchmark.</p> + + + + + Event Driven Post Generation in Datagen + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + Fri, 10 Apr 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/event-driven-post-generation-in-datagen/ + <p>As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.</p> +<p>First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), the activity generation starts. Persons are divided into blocks of 10k, in the same way they are during friendship edges generation process. Then, for each person of the block, three types of forums are created:</p> +<ul> +<li> +<p>The wall of the person</p> +</li> +<li> +<p>The albums of the person</p> +</li> +<li> +<p>The groups where the person is a moderator</p> +</li> +</ul> +<p>We will put our attention to group generation, but the same concepts apply to the other types of forums. Once a group is created, the members of the group are selected. These are selected from either the friends of the moderator, or random persons within the same block.</p> +<p>After assigning the members to the group, the post generation starts. We have two types of post generators, the uniform post generator and the event based post generator. Each post generator is responsible of, given a forum, generate a set of posts for the forum, whose authors are taken from the set of members of the forum. 
The uniform post generator distributes the dates of the generated posts uniformly in the time line (from the date of the membership until the end of the simulation time). On the other hand, the event based post generator assigns dates to posts, based on what we call “flashmob events”.</p> +<p>Flashmob events are generated at the beginning of the execution. Their number is predefined by a configuration parameter which is set to 30 events per month of simulation, and the time of the event is distributed uniformly along all the time line. Also, each event has a volume level assigned (between 1 and 20) following a power law distribution, which determines how relevant or important the event is, and a tag representing the concept or topic of the event. Two different events can have the same tag. For example, one of the flashmob events created for SF1 is one related to &ldquo;Enrique Iglesias&rdquo; tag, whose level is 11 and occurs on 29th of May of 2012 at 09:33:47.</p> +<p>Once the event based post generation starts for a given group, a subset of the generated flashmob events is extracted. These events must be correlated with the tag/topic of the group, and the set of selected events is restricted by the creation date of the group (in a group one cannot talk about an event previous to the creation of the group). Given this subset of events and their volume level, a cumulative probability distribution (using the events sorted by event date and their level) is computed, which is later used to determine to which event a given post is associated. Therefore, those events with a larger level will have a larger probability to receive posts, making their volume larger. 
Then, post generation starts, which can be summarized as follows:</p> +<ul> +<li> +<p>Determine the number of posts to generate</p> +</li> +<li> +<p>Select a random member of the group that will generate the post</p> +</li> +<li> +<p>Determine the event the post will be related to given the aforementioned cumulative distribution</p> +</li> +<li> +<p>Assign the date of the post based on the event date</p> +</li> +</ul> +<p>In order to assign the date to the post, based on the date of the event the post is assigned to, we follow the following probability density, which has been extracted from <a href="#references">[1]</a>. The shape of the probability density consists of a combination of an exponential function in the 8 hour interval around the peak, while the volume outside this interval follows a logarithmic function. The following figure shows the actual shape of the volume, centered at the date of the event.</p> +<p><img src="index.png" alt=""></p> +<p>Following the example of &ldquo;Enrique Iglesias&rdquo;, the following figure shows the activity volume of posts around the event as generated by Datagen.</p> +<p><img src="index2.png" alt=""></p> +<p>In this blog entry we have seen how datagen creates event driven user activity. This allows us to reproduce the heterogenous post creation density found in a real social network, where post creation is driven by real world events.</p> +<h4 id="references">References</h4> +<p>[1] Jure Leskovec, Lars Backstrom, Jon M. Kleinberg: Meme-tracking and the dynamics of the news cycle. 
KDD 2009: 497-506</p> + + + + + Sixth TUC Meeting + https://ldbcouncil.org/event/sixth-tuc-meeting/ + Thu, 19 Mar 2015 13:53:33 -0400 + + https://ldbcouncil.org/event/sixth-tuc-meeting/ + <p>The LDBC consortium are pleased to announce its Sixth Technical User Community (TUC) meeting.</p> +<p>This will be a two-day event at Universitat Politècnica de Catalunya, Barcelona on <strong>Thursday and Friday March 19/20, 2015.</strong></p> +<p>The LDBC FP7 EC funded project is reaching its finalisation, and this will be the last event sponsored directly by the project. However, tasks within LDBC will continue based on the LDBC independent organisation. The event will basically set the following aspects:</p> +<ul> +<li>Two day event with one day devoted to User&rsquo;s experiences and one day devoted to benchmarking experiences.</li> +<li>Presentation of the first benchmarking results for the different benchmarks.</li> +<li>Interaction with the new LDBC Board of Directors and the whole new LDBC organisation officials.</li> +<li>Pre-event with the 3rd Graph-TA workshop organised on March 18th at the same premises, with a lot of interaction and interesting research presentations.</li> +</ul> +<p>We welcome all users of RDF and Graph technologies to attend. If you are interested, please, contact <a href="mailto:damaris@ac.upc.edu">damaris@ac.upc.edu</a>.</p> +<h3 id="agenda">Agenda</h3> +<p><strong>Thursday 19th March</strong></p> +<p>11:00 - 11:30 Registration, coffee break and welcome (Josep Larriba Pey)</p> +<p>11:30 - 12:00 LDBC introduction and status update (Peter Boncz) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981131.pdf">slides</a></p> +<p>12:00 - 13:30 Technology and benchmarking (chair: Peter Boncz)</p> +<p>12:00 Venelin Kotsev (Ontotext). Semantic Publishing Benchmark v2.0. 
– <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981137.pdf">slides</a></p> +<p>12:30 Nina Saveta (FORTH). SPIMBENCH: A Scalable, Schema-Aware, Instance Matching Benchmark for the Semantic Publishing Domain</p> +<p>12:50 Tomer Sagi (HP). Titan DB on LDBC SNB Interactive</p> +<p>13:10 Claudio Martella (VUA): Giraph and Lighthouse</p> +<p>13:30 - 14:30 Lunch break</p> +<p>14:30 - 16:00 Applications and use of Graph Technologies (chair: Hassan Chafi)</p> +<p>14:30 Jerven Bolleman (Swiss Institute of Bioinformatics): 20 billion triples in production <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981132.pdf">slides</a></p> +<p>14:50 Mark Wilkinson (Universidad Politécnica de Madrid): Design principles for Linked-Data-native Semantic Web Services <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981133.pdf">slides</a></p> +<p>15:10 Peter Haase (Metaphacts, Systap LLC): Querying the Wikidata Knowledge Graph <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981139.pdf">slides</a></p> +<p>15:30 Esteban Sota (GNOSS): Human Interaction with Faceted Searching Systems for big or complex graphs</p> +<p>18:30 - 20:00 Cultural visit Barcelona city center. Meet at Plaça Catalunya.</p> +<p>20:00 Social dinner at <a href="http://www.bastaix.com">Bastaix Restaurant</a>.</p> +<p><strong>Friday 20th March</strong></p> +<p>9:30 - 11:00 Technology and Benchmarking (chair: Josep L. Larriba-Pey)</p> +<p>9:30 Yinglong Xia (IBM): Towards Temporal Graph Management and Analytics</p> +<p>9:50 Alexandru Iosup (TU Delft). 
Graphalytics: A big data benchmark for graph-processing platforms</p> +<p>10:10 John Snelson (MarkLogic): Introduction to MarkLogic</p> +<p>10:30 Arnau Prat (UPC-Sparsity Technologies) and Alex Averbuch (Neo): Social Network Benchmark, Interactive Workload</p> +<p>10:50 Moritz Kaufmann. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/moritz-kaufmann-ldbc-snb-benchmark-auditing-6th-ldbc-tuc.pdf">The auditing experience</a></p> +<p>11:15 - 11:45 Coffee break</p> +<p>11:45 - 12:45 Applications and use of Graph Technologies (chair: Atanas Kiryakov)</p> +<p>11:45 Boris Motik (Oxford University): Parallel and Incremental Materialisation of RDF/Datalog in RDFox</p> +<p>12:05 Andreas Both (Unister): E-Commerce and Graph-driven Applications: Experiences and Optimizations while moving to Linked Data</p> +<p>12:25 Smrati Gupta (CA Technologies). Modaclouds Decision Support System in multicloud environments</p> +<p>12:45 Peter Boncz. Conclusions for the LDBC project and future perspectives. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/6881717/6981138.pdf">slides</a></p> +<p>13:30 - 14:30 Lunch break</p> +<p>15:00 LDBC Board of Directors</p> +<h3 id="logistics">Logistics</h3> +<h6 id="date"><strong>Date</strong></h6> +<p>19th and 20th March 2015</p> +<h6 id="venue"><strong>Venue</strong></h6> +<p>The TUC meeting will be held at &ldquo;Aula Master&rdquo; at A3 building located inside the &ldquo;Campus Nord UPC&rdquo; in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h5 id="maps-and-situation"><strong>Maps and situation</strong></h5> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<h5 id="finding-upc"><strong>Finding UPC</strong></h5> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<h5 id="finding-the-meeting-room"><strong>Finding the meeting room</strong></h5> +<h5 id="getting-there">Getting there</h5> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to<br> +the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €20 and trips to other destinations in the city cost approximately €25-30.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/sixth-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<h5 id="the-locations-of-the-airport-and-the-city-centre"><strong>The locations of the airport and the city centre</strong></h5> + + + + + The LDBC Datagen Community Structure + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + Sun, 15 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/the-ldbc-datagen-community-structure/ + <p>This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.</p> +<p>When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution, clustering coefficient. 
Real graphs, and especially social networks, typically have highly skewed degree distributions with a long tail, a moderately large clustering coefficient and an appreciable community structure.</p> +<p>The first two characteristics are deliberately modeled in DATAGEN. DATAGEN generates persons with a degree distribution that matches that observed in Facebook, and thanks to the attribute correlated edge generation process, we obtain graphs with a moderately large clustering coefficient. But what about the community structure of graphs generated with DATAGEN? The answer can be found in the paper titled “How community-like is the structure of synthetically generated graphs”, which was published in GRADES 2014 <a href="#references">[1]</a>. Here we summarize the paper and its contributions and findings.</p> +<p>Existing synthetic graph generators, such as Rmat <a href="#references">[2]</a> and Mag <a href="#references">[3]</a>, are designed to produce graphs with long-tailed distributions and a large clustering coefficient, but they completely ignore the fact that real graphs are structured into communities. For this reason, Lancichinetti et al. proposed LFR <a href="#references">[4]</a>, a graph generator that not only produces graphs with realistic high-level characteristics, but also enforces an appreciable community structure. This generator has become the de facto standard for benchmarking community detection algorithms, as it outputs not only a graph but also the communities present in that graph; hence it can be used to test the quality of a community detection algorithm.</p> +<p>However, no one had studied whether the community structure produced by LFR was in fact realistic compared to real graphs. Even though the communities in LFR exhibit interesting properties, such as the expected larger internal density than external, or a long-tailed distribution of community sizes, they lack the noise and inhomogeneities present in a real graph. 
And more importantly, how does the community structure of DATAGEN compare to that exhibited in LFR and real graphs? Is it more or less realistic? The authors of <a href="#references">[1]</a> set up an experiment where they analyzed the characteristics of the communities output by LFR, and the groups (groups of people interested in a given topic) output by DATAGEN, and compared them to a set of real graphs with metadata. These real graphs, which can be downloaded from the Snap project website, are graphs that have recently become very popular in the field of community detection, as they contain ground truth communities extracted from their metadata. The ground truth graphs used in this experiment are shown in the following table. For more details about how this ground truth is generated, please refer to <a href="#references">[4]</a>.</p> +<table> +<thead> +<tr> +<th></th> +<th><em>Nodes</em></th> +<th><em>Edges</em></th> +</tr> +</thead> +<tbody> +<tr> +<td><em>Amazon</em></td> +<td>334863</td> +<td>925872</td> +</tr> +<tr> +<td><em>Dblp</em></td> +<td>317080</td> +<td>1049866</td> +</tr> +<tr> +<td><em>Youtube</em></td> +<td>1134890</td> +<td>2987624</td> +</tr> +<tr> +<td><em>Livejournal</em></td> +<td>3997962</td> +<td>34681189</td> +</tr> +</tbody> +</table> +<p>The authors of <a href="#references">[1]</a> selected a set of statistical indicators to<br> +characterize the communities:</p> +<ul> +<li>The clustering coefficient</li> +<li>The triangle participation ratio (TPR), which is the ratio of nodes that close at least one triangle in the community.</li> +<li>The bridge ratio, which is the ratio of edges whose removal disconnects the community.</li> +<li>The diameter</li> +<li>The conductance</li> +<li>The size</li> +</ul> +<p>The authors start by analyzing each community of the ground truth graphs using the above statistical indicators and plotting the distributions of each of them. The following are the plots of the Livejournal graph. 
We summarize the findings of the authors regarding real graphs: + Several indicators (Clustering Coefficient, TPR and Bridge ratio) exhibit a multimodal distribution, with two peaks at their extremes.</p> +<ul> +<li>Many of the communities (44%) have a small clustering coefficient between 0 and 0.01. Out of them, 56% have just three vertices. On the other hand, 11% of the communities have a clustering coefficient between 0.99 and 1.0. In between, communities exhibit different values of clustering coefficients. This trend is also observed for TPR and bridge ratio. This suggests that communities cannot be modeled using a single model.</li> +<li>84% of the communities have a diameter smaller than five, suggesting that ground truth communities are small and compact.</li> +<li>Ground truth communities are not very isolated; they have a lot of connections pointing outside of the community.</li> +<li>Most of the communities are small (10 or fewer nodes).</li> +<li>In general, ground truth communities are small, with a low diameter, not isolated and with different ranges of internal connectivity.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index.png" alt=""></td> +<td style="text-align:center"><img src="index2.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index3.png" alt=""></td> +<td style="text-align:center"><img src="index4.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index5.png" alt=""></td> +<td style="text-align:center"><img src="index6.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> 
+<p>The authors performed the same experiment but for DATAGEN and LFR graphs. They generated a graph of 150k nodes, using their default parameters. In the case of LFR, they tested five different values of the mixing factor, which specifies the ratio of edges of the community pointing outside of the community. They ranged this value from 0 to 0.5. The following are the distributions for DATAGEN.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index8.png" alt=""></td> +<td style="text-align:center"><img src="index9.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index10.png" alt=""></td> +<td style="text-align:center"><img src="index11.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index11.png" alt=""></td> +<td style="text-align:center"><img src="index12.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>The main conclusions that can be extracted from DATAGEN can be summarized as follows:</p> +<ul> +<li>DATAGEN is able to reproduce the multimodal distribution observed for clustering coefficient, TPR and bridge ratio.</li> +<li>The central part of the clustering coefficient is biased towards the left, in a similar way as observed for the youtube and livejournal graphs.</li> +<li>As in real graphs, communities of DATAGEN graphs are not isolated, but in this case their level of isolation is significantly larger.</li> +<li>The diameter is small like in the real graphs.</li> +<li>It is significant that communities in DATAGEN graphs are closer to those observed in Youtube and 
Livejournal, as these are social networks like the graphs produced by DATAGEN. We see that DATAGEN is able to reproduce many of their characteristics.</li> +</ul> +<p>Finally, the authors repeat the same experiment for LFR graphs. The following are the plots for the LFR graph with mixing ratio 0.3. From them, the authors extract the following conclusions:</p> +<ul> +<li>LFR graphs do not show the multimodal distribution observed in real graphs</li> +<li>Only the diameter shows a similar shape as in the ground truth.</li> +</ul> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index13.png" alt=""></td> +<td style="text-align:center"><img src="index14.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index15.png" alt=""></td> +<td style="text-align:center"><img src="index16.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index17.png" alt=""></td> +<td style="text-align:center"><img src="index18.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>To better quantify how similar the distributions are between the different graphs, the authors also show the correlograms for each of the statistical indicators. These correlograms contain the Spearman&rsquo;s correlation coefficient between each pair of graphs for a given statistical indicator. The more blue the color, the better the correlation is. We see that DATAGEN distributions correlate very well with those observed in real graphs, especially, as we commented above, with Youtube and Livejournal. 
On the other hand, LFR only succeeds significantly in the case of the Diameter.</p> +<table> +<thead> +<tr> +<th style="text-align:center"></th> +<th style="text-align:center"></th> +</tr> +</thead> +<tbody> +<tr> +<td style="text-align:center"><img src="index19.png" alt=""></td> +<td style="text-align:center"><img src="index20.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Clustering Coefficient</td> +<td style="text-align:center">TPR</td> +</tr> +<tr> +<td style="text-align:center"><img src="index21.png" alt=""></td> +<td style="text-align:center"><img src="index22.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Bridge Ratio</td> +<td style="text-align:center">Diameter</td> +</tr> +<tr> +<td style="text-align:center"><img src="index23.png" alt=""></td> +<td style="text-align:center"><img src="index24.png" alt=""></td> +</tr> +<tr> +<td style="text-align:center">Conductance</td> +<td style="text-align:center">Size</td> +</tr> +</tbody> +</table> +<p>We see that DATAGEN is able to reproduce a realistic community structure, compared to existing graph generators. This feature could potentially be exploited to define new benchmarks to measure the quality of novel community detection algorithms. Stay tuned for future blog posts about this topic!</p> +<h4 id="references">References</h4> +<p>[1] Arnau Prat-Pérez, <a href="http://dblp.uni-trier.de/pers/hd/d/Dom=iacute=nguez=Sal:David">David Domínguez-Sal</a>: How community-like is the structure of synthetically generated graphs? <a href="http://dblp.uni-trier.de/db/conf/sigmod/grades2014.html#PratD14">GRADES 2014</a></p> +<p>[2] Deepayan Chakrabarti, Yiping Zhan, and Christos Faloutsos. R-mat: A recursive model for graph mining. SIAM 2014</p> +<p>[3] Myunghwan Kim and Jure Leskovec. Multiplicative attribute graph model of real-world networks. Internet Mathematics</p> +<p>[4] Andrea Lancichinetti, Santo Fortunato, and Filippo Radicchi. 
Benchmark graphs for testing community detection algorithms. Physical Review E 2008.</p> + + + + + Industry Relevance of the Semantic Publishing Benchmark + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + Tue, 03 Mar 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/industry-relevance-of-the-semantic-publishing-benchmark/ + <h3 id="publishing-and-media-businesses-are-going-through-transformation">Publishing and media businesses are going through transformation</h3> +<p>I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and wrestling my jetlag in front of Hilton. In the lobby inside the SemTech 2010 conference attendees were watching a game from the FIFA World Cup in South Africa. In the picture, the self-service newspaper stand is empty, except for one free paper. It was not long ago, in the year 2000, this stand was full. Back then the people in the Bay area were willing to pay for printed newspapers. But this is no longer true.</p> +<p>What’s driving this change in publishing and media?</p> +<ul> +<li> +<p>Widespread and instantaneous distribution of information over the Internet has turned news into somewhat of a &ldquo;commodity&rdquo; and few people are willing to pay for it</p> +</li> +<li> +<p>The wealth of free content on YouTube and similar services spoiled the comfort of many mainstream broadcasters;</p> +</li> +<li> +<p>Open access publishing has limited academic publishers to sell journals and books at prices that were considered fair ten years ago.</p> +</li> +</ul> +<p><em>Alongside other changes in the industry, publishers figured out that it is critical to add value through better authoring, promotion, discoverability, delivery and presentation of precious content.</em></p> +<h3 id="imagine-instant-news-in-context-imagine-personal-channels-imagine--triplestores">Imagine instant news in context, Imagine personal channels, Imagine &hellip; triplestores</h3> +<p>While plain 
news can be created repeatedly, premium content and services are not as easy to create. Think of an article that not only tells the new facts, but refers back to previous events and is complemented by an info-box of relevant facts. It allows one to interpret and comprehend news more effectively. This is the well-known journalistic aim to put news in context. It is also well-known that producing such news in &ldquo;near real time&rdquo; is difficult and expensive using legacy processes and content management technology.</p> +<p>Another example would be a news feed that delivers good coverage of information relevant to a narrow subject – for example a company, a story line or a region. Judging by the demand for intelligent press clipping services like <a href="http://new.dowjones.com/products/factiva/">Factiva</a>, such channels are in demand but are not straightforward to produce with today’s technology. Despite the common perception that automated recommendations for related content and personalized news are technology no-brainers, suggesting truly relevant content is far from trivial.</p> +<p>Finally, if we use an example in life sciences, the ability to quickly find scientific articles discussing asthma and x-rays, while searching for respiration disorders and radiation, requires a search service that is not easy to deliver.</p> +<p>Many publishers have been pressed to advance their business. This, in turn, has led to a quest to innovate. And semantic technology can help publishers in two fundamental ways:</p> +<ol> +<li>Generation of rich and &ldquo;meaningful&rdquo; (trying not to use &ldquo;semantic&rdquo; :-) metadata descriptions;</li> +<li>Dynamic retrieval of content, based on this rich metadata, enabling better delivery.</li> +</ol> +<p>In this post I write about &ldquo;semantic annotation&rdquo; and how it enables application scenarios like BBC’s Dynamic Semantic Publishing (DSP). I will also present the business case behind DSP. 
The final part of the post is about triplestores – semantic graph database engines, used in DSP. To be more concrete I write about the Semantic Publishing Benchmark (SPB), which evaluates the performance of triplestores in DSP scenarios.</p> +<h3 id="semantic-annotation-produces-rich-metadata-descriptions--the-fuel-for-semantic-publishing">Semantic Annotation produces Rich Metadata Descriptions – the fuel for semantic publishing</h3> +<p>The most popular meaning of &ldquo;semantic annotation&rdquo; is the process of enrichment of text with links to (descriptions of) concepts and entities mentioned in the text. This usually means tagging either the entire document or specific parts of it with identifiers of entities. These identifiers allow one to retrieve descriptions of the entities and relations to other entities – additional structured information that fuels better search and presentation.</p> +<p><img src="02_semantic_repository.png" alt=""></p> +<p>The concept of using <a href="http://infosys3.elfak.ni.ac.rs/nastava/attach/SemantickiWebKurs/sdarticle.pdf">text-mining for automatic semantic annotation</a> of text with respect to very large datasets, such as <a href="http://dbpedia.org/">DBPedia</a>, emerged in early 2000. In practical terms it means using such large datasets as a sort of gigantic gazetteer (name lookup tool) and the ability to disambiguate. Figuring out whether &ldquo;Paris&rdquo; in the text refers to the capital of France or to Paris, Texas, or to Paris Hilton is crucial in such context. 
Sometimes this is massively difficult – try to instruct a computer how to guess whether &ldquo;Hilton&rdquo; in the second sentence of this post refers to a hotel from the chain founded by her grandfather or that I had the chance to meet Paris Hilton in person on the street in San Francisco.</p> +<p>Today there are plenty of tools (such as the <a href="https://www.ontotext.com/semantic-solutions/media-publishing/">Ontotext Media and Publishing</a> platform and <a href="https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki">DBPedia Spotlight</a>) and services (such as Thomson Reuters’ <a href="http://www.opencalais.com/">OpenCalais</a> and Ontotext’s <a href="http://s4.ontotext.com">S4</a>) that offer automatic semantic annotation. Although text-mining cannot deliver 100% correct annotations, there are plenty of scenarios, where technology like this would revolutionize a business. This is the case with the Dynamic Semantic Publishing scenario described below.</p> +<h3 id="the-bbcs-dynamic-semantic-publishing-dsp">The BBC’s Dynamic Semantic Publishing (DSP)</h3> +<p>Dynamic Semantic Publishing is a model for using semantic technology in media developed by a group led by John O’Donovan and Jem Rayfield at the BBC. The implementation of DSP behind BBC’s FIFA World Cup 2010 website was the first high-profile success story for usage of semantic technology in media. It is also the basis for the SPB benchmark – sufficient reasons to introduce this use case at length below.</p> +<p>BBC Future Media &amp; Technology department have transformed the BBC relational content management model and static publishing framework to a fully dynamic semantic publishing architecture. With minimal journalistic management, media assets are being enriched with links to concepts, semantically described in a triplestore. This novel semantic approach provides improved navigation, content re-use and re-purposing through automatic aggregation and rendering of links to relevant stories. 
At the end of the day DSP improves the user experience on BBC’s web site.</p> +<p><em>&ldquo;A high-performance dynamic semantic publishing framework facilitates the publication of automated metadata-driven web pages that are light-touch, requiring minimal journalistic management, as they automatically aggregate and render links to relevant stories&rdquo;.</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">Jem Rayfield, Senior Technical Architect</a>, BBC News and Knowledge</p> +<p>The Dynamic Semantic Publishing (DSP) architecture of the BBC curates and publishes content (e.g. articles or images) based on embedded Linked Data identifiers, ontologies and associated inference. It allows for journalists to determine levels of automation (&ldquo;edited by exception&rdquo;) and support semantic advertisement placement for audiences outside of the UK. The following quote explains the workflow when a new article gets into BBC’s content management system.</p> +<p><em>&ldquo;In addition to the manual selective tagging process, journalist-authored content is automatically analysed against the World Cup ontology. A <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#language">natural language and ontological determiner process</a> automatically extracts World Cup concepts embedded within a textual representation of a story. The concepts are moderated and, again, selectively applied before publication. Moderated, automated concept analysis improves the depth, breadth and quality of metadata publishing.</em></p> +<p><img src="03_bbc_sport.png" alt=""></p> +<p><em>Journalist-published metadata is captured and made persistent for querying using the resource description framework (<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#RDF"><em>RDF</em></a>) metadata representation and triple store technology. 
<a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#BigOWLIM">A RDF triplestore</a> and <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html#SPARQL">SPARQL</a> approach was chosen over and above traditional relational database technologies due to the requirements for interpretation of metadata with respect to an ontological domain model. The high level goal is that the domain ontology allows for intelligent mapping of journalist assets to concepts and queries. The chosen triplestore provides reasoning following the forward-chaining model and thus implied inferred statements are automatically derived from the explicitly applied journalist metadata concepts. For example, if a journalist selects and applies the single concept &ldquo;Frank Lampard&rdquo;, then the framework infers and applies concepts such as &ldquo;England Squad&rdquo;, &ldquo;Group C&rdquo; and &ldquo;FIFA World Cup 2010&rdquo; &hellip;&rdquo;</em> &ndash; Jem Rayfield</p> +<p>One can consider each of the &ldquo;aggregation pages&rdquo; of BBC as a sort of feed or channel serving content related to a specific topic. If you take this perspective, with its World Cup 2010 website BBC was able to provide more than 700 thematic channels.</p> +<p><em>&ldquo;The World Cup site is a large site with over 700 aggregation pages (called index pages) designed to lead you on to the thousands of story pages and content</em></p> +<p><strong>…</strong><strong><em>we are not publishing pages, but publishing content</em></strong> <em>as assets which are then organized by the metadata dynamically into pages, but could be re-organized into any format we want much more easily than we could before.</em></p> +<p><img src="04_content_tagging.png" alt=""></p> +<p><em>… The index pages are published automatically. 
This process is what assures us of the highest quality output, but still <strong>save large amounts of time</strong> in managing the site and <strong>makes it possible for us to efficiently run so many pages</strong> for the World Cup.&rdquo;</em> &ndash; <a href="http://www.bbc.co.uk/blogs/bbcinternet/2010/07/the_world_cup_and_a_call_to_ac.html">John O&rsquo;Donovan, Chief Technical Architect, BBC Future Media &amp; Technology</a></p> +<p>To get a real feeling about the load of the triplestore behind BBC&rsquo;s World Cup web site, here are some statistics:</p> +<ul> +<li> +<p>800+ aggregation pages (Player, Team, Group, etc.), generated through SPARQL queries;</p> +</li> +<li> +<p>Average unique page requests/day: 2 million;</p> +</li> +<li> +<p>Average <strong>SPARQL queries/day: 1 million;</strong></p> +</li> +<li> +<p><strong>100s repository updates/inserts per minute</strong> with OWL 2 RL reasoning;</p> +</li> +<li> +<p>Multi data center that is fully resilient, clustered 6 node triplestore.</p> +</li> +</ul> +<h3 id="the-semantic-publishing-benchmark">The Semantic Publishing Benchmark</h3> +<p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the BBC Dynamic Semantic Publishing scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volumes of read requests (SPARQL queries collecting recent content and data to generate web pages on a specific subject, e.g. Frank Lampard).</p> +<p>SPB simulates a setup for media that deals with large volumes of streaming content, e.g. articles, pictures, videos. This content is being enriched with metadata that describes it through links to reference knowledge:</p> +<ul> +<li> +<p><em>Reference knowledge:</em> taxonomies and databases that include relevant concepts, entities and factual information (e.g. 
sport statistics);</p> +</li> +<li> +<p><em>Metadata</em> for each individual piece of content allows publishers to efficiently produce live streams of content relevant to specific subjects.</p> +</li> +</ul> +<p>In this scenario the triplestore holds both reference knowledge and metadata. The main interactions with the repository are of two types:</p> +<ul> +<li> +<p><em>Aggregation queries</em> retrieve content according to various criteria. There are two sets (mixes) of aggregation queries. The basic one includes interactive queries that involve retrieval of concrete pieces of content, as well as aggregation functions, geo-spatial and full-text search constraints. The analytical query mix includes analytical queries, faceted search and drill-down queries;</p> +</li> +<li> +<p><em>Updates</em>, adding new metadata or updating the reference knowledge. It is important that such updates should immediately impact the results of the aggregation queries. Imagine a fan checking the page for Frank Lampard right after he scored a goal – she will be very disappointed to see out of date statistics there.</p> +</li> +</ul> +<p>SPB v.1.0 directly reproduces the DSP setup at the BBC. The reference dataset consists of BBC Ontologies (Core, Sport, News), BBC datasets (list of F1 teams, MPs, etc.) and an excerpt from <a href="http://www.geonames.org/">Geonames</a> for the UK. The benchmark is packed with metadata generator that allows one to set up experiments at different scales. The metadata generator produces 19 statements per Creative Work (BBC’s slang for all sorts of media assets). The standard scale factor is 50 million statements.</p> +<p>A more technical introduction to SPB can be found in this <a href="https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark">post</a>. 
Results from experiments with SPB on different hardware configurations, including AWS instances, are available in this <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">post</a>. An interesting discovery is that given the current state of the technology (particularly the GraphDB v.6.1 engine) and today’s cloud infrastructure, the load of BBC’s World Cup 2010 website can be handled at AWS by a cluster that costs only $81/day.</p> +<p>Despite the fact that SPB v.1.0 follows closely the usage scenario for triplestores in BBC’s DSP incarnations, it is relevant to a wide range of media and publishing scenarios, where large volumes of &ldquo;fast flowing&rdquo; content need to be &ldquo;dispatched&rdquo; to serve various information needs of a huge number of consumers. The main challenges can be summarized as follows:</p> +<ul> +<li> +<p>The Triplestore is used as operational database serving a massive number of read queries (hundreds of queries per second) in parallel with tens of update transactions per second. Transactions need to be handled instantly and in a reliable and consistent manner;</p> +</li> +<li> +<p>Reasoning is needed to map content descriptions to queries in a flexible manner;</p> +</li> +<li> +<p>There are specific requirements, such as efficient handling of full-text search, geo-spatial and temporal constraints.</p> +</li> +</ul> +<h3 id="spb-v20--steeper-for-the-engines-closer-to-the-publishers">SPB v.2.0 – steeper for the engines, closer to the publishers</h3> +<p>We are in the final testing of the new version 2.0 of SPB. The benchmark has evolved to allow for retrieval of semantically relevant content in a more advanced manner and at the same time to demonstrate how triplestores can offer simplified and more efficient querying.</p> +<p>The major changes in SPB v.2.0 can be summarized as follows:</p> +<ul> +<li> +<p>Much bigger reference dataset: from 170 thousand to 22 million statements. 
Now it includes GeoNames data about all of Europe (around 7 million statements) and DBPedia data about companies, people and events (14 million statements). This way we can simulate media archives described against datasets with good global coverage for specific types of objects. Such large reference sets also provide a better testing ground for experiments with very large content archives – think of 50 million documents (1 billion statements) or more;</p> +</li> +<li> +<p>Better interconnected reference data: more than 5 million links between entities, including 500,000 owl:sameAs links between DBPedia and Geonames descriptions. The latter evaluates the capabilities of the engine to deal with data coming from multiple sources, which use different identifiers for one and the same entity;</p> +</li> +<li> +<p>Retrieval of relevant content through links in the reference data, including inferred ones. To this end it is important that SPB v.2.0 involves much more comprehensive inference, particularly with respect to transitive closure of parent-company and geographic nesting chains.</p> +</li> +</ul> + + + + + OWL-Empowered SPARQL Query Optimization + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + Wed, 18 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/owl-empowered-sparql-query-optimization/ + <p>The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. 
Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution plans for a given query.</p> +<p>In this post, we propose a different approach to query optimization, which is meant to complement (rather than replace) the standard optimization methodologies for SPARQL queries. Our approach is based on the use of schema information, encoded using OWL constructs, which often accompanies Linked Data.</p> +<p>OWL adopts the Open World Assumption and hence OWL axioms are perceived primarily to infer new knowledge. Nevertheless, ontology designers consider OWL as an expressive schema language used to express constraints for validating the datasets, hence following the Closed World Assumption when interpreting OWL ontologies. Such constraints include disjointness/equivalence of classes/properties, cardinality constraints, domain and range restrictions for properties and others.</p> +<p>This richness of information carried over by OWL axioms can be the basis for the development of schema-aware techniques that will allow significant improvements in the performance of existing RDF query engines when used in tandem with data statistics or even other heuristics based on patterns found in SPARQL queries. As a simple example, a cardinality constraint at the schema level can provide a hint on the proper join ordering, even if data statistics are missing or incomplete.</p> +<p>The aim of this post is to show that the richness of information carried over by OWL axioms under the Closed World Assumption can be the basis for the development of schema-aware optimization techniques that will allow considerable improvement for query processing. 
To attain this objective, we discuss a small set of interesting cases of OWL axioms; a full list can be found <a href="LDBC_D4.4.2_final.pdf">here</a>.</p> +<h3 id="schema-based-optimization-techniques">Schema-Based Optimization Techniques</h3> +<p>Here we provide some examples of queries, which, when combined with specific schema constraints expressed in OWL, can help the optimizer in formulating the (near to) optimal query plans.</p> +<p>A simple first case is the case of constraint violation. Consider the query below, which returns all instances of class <code>&lt;A&gt;</code> which are fillers of a specific property <code>&lt;P&gt;</code>. If the underlying schema contains the information that the range of <code>&lt;P&gt;</code> is class <code>&lt;B&gt;</code>, and that class <code>&lt;B&gt;</code> is disjoint from class <code>&lt;A&gt;</code>, then this query should return the empty result, with no further evaluation (assuming that the constraints associated with the schema are satisfied by the data). An optimizer that takes into account schema information should return an empty result in constant time instead of trying to optimize or evaluate the large star join.</p> +<pre tabindex="0"><code>SELECT ?v +WHERE { ?v rdf : type &lt;A&gt; . + ?u &lt;P&gt; ?v . ?u &lt;P&gt; ?v1 . + ?u &lt;P1 &gt; ?v2 . ?u &lt;P2 &gt; ?v3 . + ?u &lt;P3 &gt; ?v4 . ?u &lt;P4 &gt; ?v5} +</code></pre><p>Schema-aware optimizers could also prune the search space by eliminating results that are known a priori not to be in the answer set of a query. The query above is an extreme such example (where all potential results are pruned), but other cases are possible, such as the case of the query below, where all subclasses of class <code>&lt;A1&gt;</code> can immediately be identified as not being in the answer set.</p> +<pre tabindex="0"><code>SELECT ?c +WHERE { ?x rdf: type ?c . ?x &lt;P&gt; ?y . 
+ FILTER NOT EXISTS \{ ?x rdf: type &lt;A1 &gt; }} +</code></pre><p>Another category of schema-empowered optimizations has to do with improved selectivity estimation. In this respect, knowledge about the cardinality (minimum cardinality, maximum cardinality, exact cardinality, functionality) of a property can be exploited to formulate better query plans, even if data statistics are incomplete, missing or erroneous.</p> +<p>Similarly, taking into account class hierarchies, or the definition of classes/properties via set theoretic constructs (union, intersection) at the schema level, can provide valuable information on the selectivity of certain triple patterns, thus facilitating the process of query optimization. Similar effects can be achieved using information about properties (functionality, transitivity, symmetry etc).</p> +<p>As an example of these patterns, consider the query below, where class <code>&lt;C&gt;</code> is defined as the intersection of classes <code>&lt;C1&gt;</code>,<code> &lt;C2&gt;</code>. Thus, the triple pattern <code>(?x rdf:type &lt;C&gt;)</code> is more selective than <code>(?y rdf:type &lt;C1&gt;)</code> and <code>(?z rdf:type &lt;C2&gt;)</code> and this should be immediately recognizable by the optimizer, without having to resort to cost estimations. This example shows also how unnecessary triple patterns can be pruned from a query to reduce the number of necessary joins. Figure 1 illustrates the query plan obtained when the OWL intersectionOf construct is used.</p> +<pre tabindex="0"><code>SELECT ?x +WHERE { ?x rdf: type &lt;C&gt; . ?x &lt;P1 &gt; ?y . + ?y rdf : type &lt;C1 &gt; . ?y &lt;P2 &gt; ?z . ?z rdf : type &lt;C2 &gt; } +</code></pre><p><img src="owl_constraints.png" alt="image"></p> +<p>Schema information can also be used by the query optimizer to rewrite SPARQL queries to equivalent ones that are found in a form for which already known optimization techniques are easily applicable. 
For example, the query below could easily be transformed into a classical star-join query if we know (from the schema) that property <code>P4</code> is a symmetric property.</p> +<pre tabindex="0"><code>SELECT ?y ?y1 ?y2 ?y3 +WHERE { ?x &lt;P1 &gt; ?y . ?x &lt;P2 &gt; ?y1 . + ?x &lt;P3 &gt; ?y2 . ?y3 &lt;P4 &gt; ?x } +</code></pre><h3 id="conclusion">Conclusion</h3> +<p>In this post we argued that OWL-empowered optimization techniques can be beneficial for SPARQL query optimization when used in tandem with standard heuristics based on statistics. We provided some examples which showed the power of such optimizations in various cases, namely:</p> +<ul> +<li>Cases where the search space can be pruned due to the schema and the associated constraints; an extreme special sub-case is the identification of queries that violate schema constraints and thus produce no results.</li> +<li>Cases where the schema can help in the estimation of triple pattern selectivity, even if statistics are incomplete or missing.</li> +<li>Cases where the schema can identify redundant triple patterns that do not affect the result and can be safely eliminated from the query.</li> +<li>Cases where the schema can be used for rewriting a query in an equivalent form that would facilitate optimization using well-known optimization techniques.</li> +</ul> +<p>This list is by no means complete, as further cases can be identified by optimizers. 
Our aim in this post was not to provide a complete listing, but to demonstrate the potential of the idea in various directions.</p> + + + + + Person Activity Subgraph Features in LDBC DATAGEN + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + Wed, 04 Feb 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/person-activity-subgraph-features-in-ldbc-datagen/ + <p>When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically borrowed by the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people being connected by friendship relations, but has a lot more of other things is worth to look at. With a quick view to commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other elements such as text images or even video assets. More importantly, all these elements form other subgraphs within the social network! For example, the person activity subgraph is composed by posts and their replies in the different forums/groups in a social network, and has a tree-like structure connecting people through their message interactions.</p> +<p>When looking at the LDBC Social Network Benchmark (SNB) and its interactive workload, one realizes that these other subgraphs, and especially the person activity subgraph, play a role even more important than that played by the friendship subgraph. Just two numbers that illustrate this importance: 11 out of the 14 interactive workload queries needs traversing parts of the person activity subgraph, and about 80% of all the generated data by DATAGEN belongs to this subgraph. As a consequence, a lot of effort has been devoted to make sure that the person activity subgraph is realistic enough to fulfill the needs of the benchmark. 
In the rest of this post, I will discuss some of the features implemented in DATAGEN that make the person activity subgraph interesting.</p> +<h3 id="reaslistic-message-content">Realistic Message Content</h3> +<p>Messages&rsquo; content in DATAGEN is not random, but contains snippets of text extracted from Dbpedia talking about the tags the message has. Furthermore, not all messages are the same size, depending on whether they are posts or replies to them. For example, the size of a post is selected uniformly between a minimum and a maximum, but also, there is a small probability that the content is very large (about 2000 characters). In the case of comments (replies to posts), there is a probability of 0.66 to be very short (“ok”, “good”, “cool”, “thanks”, etc.). Moreover, in real forum conversations, it is typical to see conversations evolving from one topic to another. For this reason, there is a probability that the tags of comments replying to posts change during the flow of the conversation, moving from post&rsquo;s tags to other related or randomly selected tags.</p> +<h3 id="non-uniform-activity-levels">Non uniform activity levels</h3> +<p>In a real social network, not all the members show the same level of activity. Some people post messages more sporadically than others, whose activity is significantly higher. DATAGEN reproduces this phenomenon by correlating the activity level with the number of friends the person has. That is, the larger the number of friends a person has, the larger the number of posts it creates, and also, the larger the number of groups it belongs to.</p> +<h3 id="time-correlated-post-and-comment-generation">Time correlated post and comment generation</h3> +<p>In a real social network, user activity is driven by real world events such as sport events, elections or natural disasters, just to cite a few of them. 
For this reason, we observe spikes of activity around these events, where the amount of messages created increases significantly during a short period of time, reaching a maximum and then decreasing. DATAGEN emulates this behavior by generating a set of real world events about specific tags. Then, when dates of posts and comments are generated, these events are taken into account in such a way that posts and comments are clustered around them. Also not all the events are equally relevant, thus having spikes larger than others. The shape of the activity is modeled following the model described in <a href="#references">[1]</a>. Furthermore, in order to represent the more normal and uniform person activity levels, we also generate uniformly distributed messages along the time line. The following figure shows the user activity volume along the time line.</p> +<p><img src="1.png" alt="image"></p> +<p>As we see, the timeline contains spikes of activity, instead of being uniform. Note that the generally increasing volume activity is due to the fact that more people is added to the social network as time advances.</p> +<p>In this post we have reviewed several interesting characteristics of the person activity generation process in DATAGEN. Stay tuned for future blog posts about this topic.</p> +<h4 id="references">References</h4> +<p>[1] Leskovec, J., Backstrom, L., &amp; Kleinberg, J. (2009, June). Meme-tracking and the dynamics of the news cycle. In <em>Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining</em> (pp. 497-506). 
ACM.</p> + + + + + SNB Driver - Part 2: Tracking Dependencies Between Queries + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + Fri, 23 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries/ + <p>The <a href="https://ldbcouncil.org/post/snb-driver-part-1">SNB Driver part 1</a> post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we&rsquo;ll drill down deeper into the details of what it means to execute &ldquo;dependent queries&rdquo; during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in detail, it is just there to serve as a point of reference.</p> +<h3 id="definitions">Definitions</h3> +<ul> +<li> +<p><em>Simulation Time (ST)</em>: notion of time created by data generator. All time stamps in the generated data set are in simulation time</p> +</li> +<li> +<p><em>Real Time (RT)</em>: wall clock time</p> +</li> +<li> +<p><em>Time Compression Ratio</em>: function that maps simulation time to real time, e.g., an offset in combination with a compression ratio. It is a static value, set in driver configuration. Real Time Ratio is reported along with benchmark results, allowing others to recreate the same benchmark</p> +</li> +<li> +<p><em>Operation</em>: read and/or write</p> +</li> +<li> +<p><em>Dependencies</em>: operations in this set introduce dependencies in the workload. 
That is, for every operation in this set there exists at least one other operation (in Dependents) that can not be executed until this operation has been processed</p> +</li> +<li> +<p><em>Dependents</em>: operations in this set are dependent on at least one other operation (in Dependencies) in the workload</p> +</li> +<li> +<p><em>Due Time (DueT)</em>: point in simulation time at which the execution of an operation should be initiated.</p> +</li> +<li> +<p><em>Dependent Time (DepT)</em>: in addition to Due Time, every operation in Dependents also has a Dependent Time, which corresponds to the Due Time of the operation that it depends on. Dependent Time is always before Due Time. For operations with multiple dependencies Dependent Time is the maximum Due Time of all the operations it depends on.</p> +</li> +<li> +<p><em>Safe Time (SafeT)</em>: time duration.</p> +<ul> +<li> +<p>when two operations have a necessary order in time (i.e., dependency) there is at least a SafeT interval between them</p> +</li> +<li> +<p>SafeT is the minimum duration between the Dependent Time and Due Time of any operation in Dependents</p> +</li> +</ul> +</li> +<li> +<p><em>Operation Stream</em>: sequence of operations ordered by Due Time (dependent operations must be separated by at least SafeT)</p> +</li> +<li> +<p><em>Initiated Operations</em>: operations that have started executing but not yet finished</p> +</li> +<li> +<p><em>Local Completion Time (per driver)</em>: point in simulation time behind which there are no uncompleted operations. Local Completion Time = min(min(Initiated Operations), max(Completed Operations))</p> +</li> +<li> +<p><em>Global Completion Time (GCT)</em>: minimum completion time of all drivers. Once GCT has advanced to the Dependent Time of some operation that operation is safe to execute, i.e., the operations it depends on have all completed executing. 
Global Completion Time = min(Local Completion Time)​</p> +</li> +<li> +<p><em>Execution Window (Window)</em>: a timespan within which all operations can be safely executed</p> +<ul> +<li> +<p>All operations satisfying window.startTime &lt;= operation.DueT &lt; window.endTime may be executed</p> +</li> +<li> +<p>Within a window no restrictions on operation ordering or operation execution time are enforced, driver has a freedom of choosing an arbitrary scheduling strategy inside the window</p> +</li> +<li> +<p>To ensure that execution order respects dependencies between operations, window size is bounded by SafeT, such that: 0 &lt; window.duration &lt;= SafeT</p> +</li> +<li> +<p>Window duration is fixed, per operation stream; this is to simplify scheduling and make benchmark runs repeatable</p> +</li> +<li> +<p>Before any operations within a window can start executing it is required that: GCT &gt;= window.startTime - (SafeT - window.duration)</p> +</li> +<li> +<p>All operations within a window must initiate and complete between window start and end times: window.startTime &lt;= operation.initiate &lt; window.endTime and window.startTime &lt;= operation.complete &lt; window.endTime</p> +</li> +</ul> +</li> +<li> +<p><em>Dependency Mode</em>: defines dependencies, constraints on operation execution order</p> +</li> +<li> +<p><em>Execution Mode</em>: defines how the runtime should execute operations of a given type</p> +</li> +</ul> +<h3 id="tracking-dependencies">Tracking Dependencies</h3> +<p>Now, the fun part, making sure dependent operations are executed in the correct order.</p> +<p>Consider that every operation in a workload belongs to none, one, or both of the following sets: Dependencies and Dependents. As mentioned, the driver uses operation time stamps (Due Times) to ensure that dependencies are maintained. It keeps track of the latest point in time behind which every operation has completed. 
That is, every operation (i.e., dependency) with a Due Time lower or equal to this time is guaranteed to have completed execution. It does this by maintaining a monotonically increasing variable called Global Completion Time (GCT).</p> +<p>Logically, every time the driver (via a database connector) begins execution of an operation from Dependencies that operation is added to Initiated Operations:</p> +<ul> +<li>the set of operations that have started executing but not yet finished.</li> +</ul> +<p>Then, upon completion, the operation is removed from Initiated Operations and added to Completed Operations:</p> +<ul> +<li>the set of operations that have started and finished executing.</li> +</ul> +<p>Using these sets, each driver process maintains its own view of GCT in the following way. Local progress is monitored and managed using a variable called Local Completion Time (LCT):</p> +<ul> +<li>the point in time behind which there are no uncompleted operations. No operation in Initiated Operations has a lower or equal Due Time and no operation in Completed Operations has an equal or higher Due Time.</li> +</ul> +<p>LCT is periodically sent to all other driver processes, which all then (locally) set their view of GCT to the minimum LCT of all driver processes. At this point the driver has two, of the necessary three (third covered shortly), pieces of information required for knowing when to execute an operation:</p> +<ul> +<li> +<p><em>Due Time</em>: point in time at which an operation should be executed, assuming all preconditions (e.g., dependencies) have been fulfilled</p> +</li> +<li> +<p><em>GCT</em>: every operation (from Dependencies) with a Due Time before this point in time has completed execution</p> +</li> +</ul> +<p>However, with only GCT to track dependencies the driver has no way of knowing when it is safe to execute any particular dependent operation. 
What GCT communicates is that all dependencies up to some point in time have completed, but whether or not the dependencies for any particular operation are within these completed operations is unknown. The driver would have to wait until GCT has passed the Due Time (because Dependency Time is always lower) of an operation before that operation could be safely executed, which would result in the undesirable outcome of every operation missing its Due Time. The required information is which particular operation in Dependencies does any operation in Dependents depend on. More specifically, the Due Time of this operation. This is referred to as Dependent Time:</p> +<ul> +<li>in addition to Due Time, every operation in Dependents also has (read: must have) a Dependent Time, which corresponds to the latest Due Time of all the operations it depends on. Once GCT has advanced beyond the Dependent Time of an operation that operation is safe to execute.</li> +</ul> +<p>Using these three mechanisms (Due Time, GCT, and Dependent Time) the driver is able to execute operations, while ensuring their dependencies are satisfied beforehand.</p> +<h3 id="scalable-execution-in-the-presence-of-dependencies">Scalable execution in the Presence of Dependencies</h3> +<p>The mechanisms introduced in part 1 guarantee that dependency constraints are not violated, but in doing so they unavoidably introduce overhead of communication/synchronization between driver threads/processes. To minimize the negative effects that synchronization has on scalability an additional Execution Mode was introduced (more about Execution Modes will be discussed shortly): Windowed Execution. Windowed Execution has two design goals:</p> +<p>a) make the generated load less &lsquo;bursty&rsquo;</p> +<p>b) allow the driver to &lsquo;scale&rsquo;, so when the driver is given more resources (CPUs, servers, etc.) 
it is able to generate more load.</p> +<p>In the context of Windowed Execution, operations are executed in groups (Windows), where operations are grouped according to their Due Time. Every Window has a Start Time, a Duration, and an End Time, and Windows contain only those operations that have a Due Time between Window.startTime and Window.endTime. Logically, all operations within a Window are executed at the same time, some time within the Window. No guaranty is made regarding exactly when, or in what order, an operation will execute within its Window.</p> +<p>The reasons this approach is correct are as follows:</p> +<ul> +<li> +<p>Operations belonging to the Dependencies set are never executed in this manner - the Due Times of Dependencies operations are never modified as this would affect how dependencies are tracked</p> +</li> +<li> +<p>The minimum duration between the Dependency Time and Due Time of any operation in Dependents is known (can be calculated by scanning through workload once), this duration is referred to as Safe Time (SafeT)</p> +</li> +<li> +<p>A window does not start executing until the dependencies of all its operations have been fulfilled. This is ensured by enforcing that window execution does not start until</p> +<p>GCT &gt;= window.startTime - (SafeT - window.duration) = window.endTime - SafeT; that is, the duration between GCT and the end of the window is no longer than SafeT</p> +</li> +</ul> +<p>The advantages of such an execution mode are as follows:</p> +<ul> +<li> +<p>As no guarantees are made regarding time or order of operation execution within a Window, GCT no longer needs to be read before the execution of every operation, only before the execution of every window</p> +</li> +<li> +<p>Then, as GCT is read less frequently, it follows that it does not need to be communicated between driver processes as frequently. 
There is no need or benefit to communicating GCT protocol message more frequently than approximately Window.duration, the side effect of which is reduced network traffic</p> +</li> +<li> +<p>Further, by making no guarantees regarding the order of execution the driver is free to reschedule operations (within Window bounds). The advantage being that operations can be rearranged in such a way as to reduce unwanted bursts of load during execution, which could otherwise occur while synchronizing GCT during demanding workloads. For example, a uniform scheduler may modify operation Due Times to be uniformly distributed across the Window timespan, to &lsquo;smoothen&rsquo; the load within a Window.</p> +</li> +</ul> +<p>As with any system, there are trade-offs to this design, particularly regarding Window.duration. The main trade-off is that between &lsquo;workload resolution&rsquo; and scalability. Increasing Window.duration reduces synchronization but also reduces the resolution at which the workload definition is followed. That is, the generated workload becomes less like the workload definition. However, as this is both bounded and configurable, it is not a major concern. This issue is illustrated in Figure 1, where the same stream of events is split into two different workloads based on different size of the Window. The workload with Window size 5 (on the right) has better resolution, especially for the &lsquo;bursty&rsquo; part of the event stream.</p> +<p><img src="window-scheduling.png" alt="image"><br> +Figure 1. Window scheduling</p> +<p>This design also trades a small amount of repeatability for scalability: as there are no timing or ordering guarantees within a window, two executions of the same window are not guaranteed to be equivalent - &lsquo;what happens in the window stays in the window&rsquo;. Despite sacrificing this repeatability, the results of operations do not change. 
No dependency-altering operations occur during the execution of a Window, therefore results for all queries should be equivalent between two executions of the same workload, there is no effect on the expected result for any given operation.</p> + + + + + SNB Driver - Part 3: Workload Execution Putting It All Together + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + Tue, 20 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-3-workload-execution-putting-it-all-together/ + <p>Up until now we have introduced the <a href="https://ldbcouncil.org/post/snb-driver-part-1">challenges faced when executing the LDBC SNB benchmark</a>, as well as explained <a href="https://ldbcouncil.org/post/snb-driver-part-2-tracking-dependencies-between-queries">how some of these are overcome</a>. With the foundations laid, we can now explain precisely how operations are executed.</p> +<p>Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these classifications the driver runtime then knows how each operation should be executed. These modes, as well as what they mean to the driver runtime, are described below.</p> +<h3 id="dependency-modes">Dependency Modes</h3> +<p>While executing a workload the driver treats operations differently, depending on their Dependency Mode. In the previous section operations were categorized by whether or not they are in the sets Dependencies and/or Dependents.</p> +<p>Another way of communicating the same categorization is by assigning a Dependency Mode to operations - every operation type generated by a workload definition must be assigned to exactly one Dependency Mode. Dependency modes define dependencies, constraints on operation execution order. 
The driver supports a number of different Dependency Modes: None, Read Only, Write Only, Read Write. During workload execution, operations of each type are treated as follows:</p> +<p><strong>• None</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>– Prior Execution: do nothing – After Execution: do nothing</p> +<p><strong>• Read Only</strong></p> +<p>Depended On (NO): operations do not introduce dependencies with other operations (i.e., the correct execution of no other operation depends on these operations to have completed executing)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: wait for GCT &gt;= operation.DepTime – After Execution: do nothing</p> +<p><strong>• Write Only</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (NO): operation execution does not depend on GCT to have advanced sufficiently (i.e., correct execution of these operations does not depend on any other operations to have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<p><strong>• Read Write</strong></p> +<p>Depended On (YES): operations do introduce dependencies with other operations (i.e., the correct execution of certain other operations requires that these operations to have completed executing, i.e., to advance GCT)</p> +<p>Dependent On (YES): operation execution does depend on GCT to have advanced 
sufficiently (i.e., correct execution of these operations requires that certain operations have completed execution)</p> +<p>– Prior Execution: add operation to Initiated Operations, wait for GCT &gt;= operation.DepT</p> +<p>– After Execution: remove operation from Initiated Operations, add operation to Completed Operations</p> +<h3 id="execution-modes">Execution Modes</h3> +<p>Execution Modes relate to how operations are scheduled, when they are executed, and what their failure conditions are. Each operation type in a workload definition must be assigned to exactly one Execution Mode. The driver supports a number of different Execution Modes: Asynchronous, Synchronous, Partially Synchronous. It splits a single workload operation stream into multiple streams, zero or more streams per Execution Mode. During workload execution, operations from each of these streams are treated as follows.</p> +<p><strong>• Asynchronous</strong>: operations are executed individually, when their Due Time arrives.</p> +<p>Motivation: This is the default execution mode, it executes operations as true to the workload definition as possible.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler – Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: unbounded</p> +<p>– Max Execution Time: unbounded</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay</p> +<p><strong>• Synchronous</strong>: operations are executed individually, sequentially, in blocking manner.</p> +<p>Motivation: Some dependencies are difficult to capture efficiently with SafeT and GCT alone. For example, social applications often support conversations via posts and likes, where likes depend on the existence of posts. Furthermore, posts and likes also depend on the existence of the users that make them. 
However, users are created at a lower frequency than posts and likes, and it can be assumed they do not immediately start creating content. As such, a reasonably long SafeT can be used between the creation of a user and the first time that user creates posts or likes. Conversely, posts are often replied to and/or liked soon after their creation, meaning a short SafeT would be necessary to maintain the ordering dependency. Consequently, maintaining the dependencies related to conversations would require a short SafeT, and hence a small window. This results in windows containing fewer operations, leading to less potential for parallelism within windows, less freedom in scheduling, more synchronization, and greater likelihood of bursty behavior - all negative things.</p> +<p>The alternative offered by Synchronous Execution is that, when practical, operations of certain types can be partitioned (e.g. posts and likes could be partitioned by the forum in which they appear), and partitions assigned to driver processes. Using the social application example from above, if all posts and likes were partitioned by forum the driver process that executes the operations from any partition could simply execute them sequentially. 
Then the only dependency to maintain would be on user operations, reducing synchronization dramatically, and parallelism could still be achieved as each partition would be executed independently, in parallel, by a different driver process.</p> +<p>– Re-scheduling Before Execution: None: operation.DueT not modified by scheduler</p> +<p>– Execute When time &gt;= operation.DueT and previousOperation.completed == true (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: 1</p> +<p>– Max Execution Time: nextOperation.DueT - operation.DueT</p> +<p>– Failure: operation execution starts later than: operation.DueT + Tolerated Delay. E.g., if previousOperation did not complete in time, forcing current operation to wait for longer than the tolerated-delay</p> +<p><strong>• Partially Synchronous</strong> (Windowed Execution, described in Section 3.4 in more detail), groups of operations from the same time window are executed together</p> +<p>– Re-scheduling Before Execution: Yes, as long as the following still holds:</p> +<p>window.startTime &lt;= operation.DueT &lt; window.startTime + window.duration</p> +<p>Operations within a window may be scheduled in any way, as long as they remain in the window from which they originated: their Due Times, and therefore ordering, may be modified</p> +<p>– Execute When time &gt;= operation.DueT (and GCT &gt;= operation.DepT)</p> +<p>– Max Concurrent Executions: number of operations within window</p> +<p>– Max Execution Time: (window.startTime + window.duration) - operation.DueT</p> +<p>– Failure: operation execution starts later than: window.startTime + window.duration; operation execution does not finish by: window.startTime + window.duration</p> +<h3 id="tying-it-back-to-ldbc-snb">Tying it back to LDBC SNB</h3> +<p>The driver was designed to execute the workload of LDBC SNB. As discussed, the main challenge of running queries in parallel on graph-shaped data stems from dependencies introduced by the graph structure. 
In other words, workload partitioning becomes as hard as graph partitioning.</p> +<p>The LDBC SNB data can in fact be seen as a union of two parts:</p> +<ol> +<li> +<p>Core Data: relatively small and dense friendship graph (not more than 10% of the data). Updates on this part are very hard to partition among driver threads, since the graph is essentially a single dense strongly connected component.</p> +</li> +<li> +<p>User Activity Data: posts, replies, likes; this is by far the biggest part of the data. Updates on this part are easily partitioned as long as the dependencies with the &ldquo;core&rdquo; part are satisfied (i.e., users don&rsquo;t post things before the profiles are created, etc.).</p> +</li> +</ol> +<p>In order to avoid friendship graph partitioning, the driver introduces the concept of SafeT, the minimal simulation time that should pass between two dependent events.</p> +<p>This property is enforced by the data generator, i.e. the driver does not need to change or delay some operations in order to guarantee dependency safety. Respecting dependencies now means globally communicating the advances of the Global Completion Time, and making sure the operations do not start earlier than SafeT from their dependents.</p> +<p>On the other hand, the driver exploits the fact that some of the dependencies in fact do not hinder partitioning: although replies to the post can only be sent after the post is created, these kinds of dependencies are satisfied if we partition the workload by forums. This way, all (update) operations on posts and comments from one forum are assigned to one driver thread. Since there are typically a lot of forums, each driver thread gets multiple ones. Updates from one forum are then run in Synchronous Execution Mode, and parallelism is achieved by running many distinct forums in parallel. By doing so, we can add posts and replies to forums at very high frequency without the need to communicate the GCT across driver instances (i.e. 
we efficiently create the so-called flash-mob effects in the posting/replying workload).</p> + + + + + Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + Tue, 13 Jan 2015 00:00:00 +0000 + + https://ldbcouncil.org/post/running-the-semantic-publishing-benchmark-on-sesame-a-step-by-step-guide/ + <p>Until now we have discussed several aspects of the <a href="https://ldbcouncil.org/benchmarks/spb">Semantic Publishing Benchmark (SPB)</a> such as the <a href="https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark">difference in performance between virtual and real servers configuration</a>, how to choose an <a href="https://ldbcouncil.org/post/making-semantic-publishing-execution-rules">appropriate query mix</a> for a benchmark run and our experience with using SPB in the development process of GraphDB for <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">finding performance issues</a>.</p> +<p>In this post we provide a step-by-step guide on how to run SPB using the <a href="http://rdf4j.org/">Sesame</a> RDF data store on a fresh install of <a href="http://releases.ubuntu.com/14.04.1/">Ubuntu Server 14.04.1</a>. 
The scenario is easy to adapt to other RDF triple stores which support the Sesame Framework used for querying and analyzing RDF data.</p> +<h3 id="prerequisites">Prerequisites</h3> +<p>We start with a fresh server installation, but before proceeding with setup of the Sesame Data Store and SPB benchmark we need the following pieces of software up and running:</p> +<ul> +<li>Git</li> +<li>Apache Ant 1.8 or higher</li> +<li>OpenJDK 6 or Oracle JDK 6 or higher</li> +<li>Apache Tomcat 7 or higher</li> +</ul> +<p>If you already have these components installed on your machine you can directly proceed to the next section: <em>Installing Sesame</em></p> +<p>Following are sample commands which can be used to install the required software components:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo apt-get install git +</span></span><span style="display:flex;"><span>sudo apt-get install ant +</span></span><span style="display:flex;"><span>sudo apt-get install default-jdk +</span></span><span style="display:flex;"><span>sudo apt-get install tomcat7 +</span></span></code></pre></div><p>Optionally Apache Tomcat Server can be downloaded as a zipped file and extracted in a location of choice.</p> +<p>After a successful installation of Apache Tomcat you should be able to get the default splash page <em>“It works”</em> when you open your web browser and enter the following address: http://&lt;your_ip_address&gt;:8080</p> +<h3 id="installing-sesame">Installing Sesame</h3> +<p>We will use current Sesame version 2.7.14. 
You can download it <a href="http://sourceforge.net/projects/sesame/files/Sesame%202/">here</a> or run following command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>wget <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> <span style="color:#e6db74">&#34;http://sourceforge.net/projects/sesame/files/Sesame%202/2.7.14/openrdf-sesame-2.7.14-sdk.tar.gz/download&#34;</span> <span style="color:#ae81ff">\\</span> +</span></span><span style="display:flex;"><span> -O openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>Then extract the Sesame tarball:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>tar -xvzf openrdf-sesame-2.7.14-sdk.tar.gz +</span></span></code></pre></div><p>To deploy sesame you have to copy the two war files that are in <em>openrdf-sesame-2.7.14/war</em> to <em>/var/lib/tomcat7/webapps</em></p> +<p>From <em>openrdf-sesame-2.7.14/war</em> you can do it with command:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>cp openrdf-*.war &lt;tomcat_install&gt;/webapps +</span></span></code></pre></div><p>Sesame applications write and store configuration files in a single directory and the tomcat server needs permissions for it.</p> +<p>By default the configuration directory is: <em>/usr/share/tomcat7/.aduna</em></p> +<p>Create the directory:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" 
data-lang="bash"><span style="display:flex;"><span>sudo mkdir /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Then change the ownership:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chown tomcat7 /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>And finally you should give the necessary permissions:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>sudo chmod o+rwx /usr/share/tomcat7/.aduna +</span></span></code></pre></div><p>Now when you go to: http://&lt;your_ip_address&gt;:8080/openrdf-workbench/repositories</p> +<p>You should get a screen like this:</p> +<p><img src="01-Sesame-repo-list.png" alt="image"></p> +<h3 id="setup-spb">Setup SPB</h3> +<p>You can download the SPB code and find brief documentation on GitHub:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm">https://github.com/ldbc/ldbc_spb_bm</a></p> +<p>A detailed documentation is located here:</p> +<p><a href="https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf">https://github.com/ldbc/ldbc_spb_bm/blob/master/doc/LDBC_SPB_v0.3.pdf</a></p> +<p>SPB offers many configuration options which control various features of the benchmark e.g.:</p> +<ul> +<li>query mixes</li> +<li>dataset size</li> +<li>loading datasets</li> +<li>number of agents</li> +<li>validating results</li> +<li>test conformance to OWL2-RL ruleset</li> +<li>update rate of agents</li> +</ul> +<p>Here we demonstrate how to generate a dataset and execute a simple test<br> +run with it.</p> +<p>First download the SPB source code from the repository:</p> +<div class="highlight"><pre tabindex="0" 
style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>git clone https://github.com/ldbc/ldbc_spb_bm.git +</span></span></code></pre></div><p>Then in the ldbc_spb_bm directory build the project:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>ant build-basic-querymix +</span></span></code></pre></div><p>If you simply execute the command:</p> +<pre tabindex="0"><code>ant +</code></pre><p>you’ll get a list of all available build configurations for the SPB test driver, but for the purpose of this step-by-step guide, configuration shown above is sufficient.</p> +<p>Depending on generated dataset size a bigger java heap size may be required for the Sesame Store. You can change it by adding following arguments to Tomcat&rsquo;s startup files e.g. in <em>catalina.sh</em>:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>export JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;-d64 -Xmx4G&#34;</span> +</span></span></code></pre></div><p>To run the Benchmark you need to create a repository in the Sesame Data Store, similar to the following screenshot:</p> +<p><img src="02-Sesame-create-repo.png" alt="image"></p> +<p>Then we need to point the benchmark test driver to the SPARQL endpoint of that repository. 
This is done in <em>ldbc_spb_bm/dist/test.properties</em> file.</p> +<p>The default value of <em>datasetSize</em> in the properties is set to be 10M, but for the purpose of this guide we will decrease it to 1M.</p> +<p>You need to change</p> +<pre tabindex="0"><code>datasetSize=1000000 +</code></pre><p>Also the URLs of the SPARQL endpoint for the repository</p> +<pre tabindex="0"><code>endpointURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1 +endpointUpdateURL=http://localhost:8080/openrdf-sesame/repositories/ldbc1/statements +</code></pre><p>First step, before measuring the performance of a triple store, is to load the reference-knowledge data, generate a 1M dataset, load it into the repository and finally generate query substitution parameters.</p> +<p>These are the settings to do that, following parameters will &lsquo;instruct&rsquo; the SPB test driver to perform all the actions described above:</p> +<pre tabindex="0"><code>#Benchmark Operational Phases +loadOntologies=true +loadReferenceDatasets=true +generateCreativeWorks=true +loadCreativeWorks=true +generateQuerySubstitutionParameters=true +validateQueryResults=false +warmUp=false +runBenchmark=false +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>To run the benchmark execute the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>java -jar semantic_publishing_benchmark-basic-standard.jar +</span></span><span style="display:flex;"><span>test.properties +</span></span></code></pre></div><p>When the initial run has finished, we should have a 1M dataset loaded into the repository and a set of files with query substitution parameters.</p> +<p>Next we will measure the performance of Sesame Data Store by changing some configuration properties:</p> +<pre tabindex="0"><code>#Benchmark 
Configuration Parameters +warmupPeriodSeconds=60 +benchmarkRunPeriodSeconds=300 +... +#Benchmark Operational Phases +loadOntologies=false +loadReferenceDatasets=false +generateCreativeWorks=false +loadCreativeWorks=false +generateQuerySubstitutionParameters=false +validateQueryResults=false +warmUp=true +runBenchmark=true +runBenchmarkOnlineReplicationAndBackup=false +checkConformance=false +</code></pre><p>After the benchmark test run has finished, result files are saved in folder: <em>dist/logs</em></p> +<p>There you will find three types of results: the result summary of the benchmark run (<em>semantic_publishing_benchmark_results.log</em>), brief results and detailed results.</p> +<p>In <em>semantic_publishing_benchmark_results.log</em> you will find the results distributed per second. They should be similar to the listing below:</p> +<p>Benchmark Results for the 300-th second</p> +<pre tabindex="0"><code>Seconds : 300 (completed query mixes : 0) + Editorial: + 2 agents + + 9 inserts (avg : 22484 ms, min : 115 ms, max : 81389 ms) + 0 updates (avg : 0 ms, min : 0 ms, max : 0 ms) + 0 deletes (avg : 0 ms, min : 0 ms, max : 0 ms) + + 9 operations (9 CW Inserts (0 errors), 0 CW Updates (1 errors), 0 CW Deletions (2 errors)) + 0.0300 average operations per second + + Aggregation: + 8 agents + + 2 Q1 queries (avg : 319 ms, min : 188 ms, max : 451 ms, 0 errors) + 3 Q2 queries (avg : 550 ms, min : 256 ms, max : 937 ms, 0 errors) + 1 Q3 queries (avg : 58380 ms, min : 58380 ms, max : 58380 ms, 0 errors) + 2 Q4 queries (avg : 65250 ms, min : 40024 ms, max : 90476 ms, 0 errors) + 1 Q5 queries (avg : 84220 ms, min : 84220 ms, max : 84220 ms, 0 errors) + 2 Q6 queries (avg : 34620 ms, min : 24499 ms, max : 44741 ms, 0 errors) + 3 Q7 queries (avg : 5892 ms, min : 4410 ms, max : 8528 ms, 0 errors) + 2 Q8 queries (avg : 3537 ms, min : 546 ms, max : 6528 ms, 0 errors) + 4 Q9 queries (avg : 148573 ms, min : 139078 ms, max : 169559 ms, 0 errors) +</code></pre><p>This step-by-step 
guide gave an introduction on how to set up and run the SPB on a Sesame Data Store. Further details can be found in the reference documentation listed above.</p> +<p>If you have any trouble running the benchmark, don&rsquo;t hesitate to comment or use our social media channels.</p> +<p>In a future post we will go through some of the parameters of SPB and check their performance implications.</p> + + + + + Semantic Publishing Instance Matching Benchmark + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + Tue, 30 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/semantic-publishing-instance-matching-benchmark/ + <p>The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the-art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.</p> +<p>The SPIMBench test cases provide a systematic way for testing the performance of instance matching systems in different settings. 
SPIMBench supports the types of test cases already adopted by existing instance matching benchmarks:</p> +<ul> +<li>value-based test cases based on applying value transformations (e.g., blank character addition and deletion, change of date format, abbreviations, synonyms) on triples relating to given input entity</li> +<li>structure-based test cases characterized by a structural transformation (e.g., different nesting levels for properties, property splitting, aggregation)</li> +</ul> +<p>The novelty of SPIMBench lies in the support for the following semantics-aware test cases defined on the basis of OWL constructs:</p> +<ul> +<li>instance (in)equality (owl:sameAs, owl:differentFrom)</li> +<li>class and property equivalence (owl:equivalentClass, owl:equivalentProperty)</li> +<li>class and property disjointness (owl:disjointWith, owl:AllDisjointClasses, owl:propertyDisjointWith, owl:AllDisjointProperties)</li> +<li>class and property hierarchies (rdfs:subClassOf, rdfs:subPropertyOf)</li> +<li>property constraints (owl:FunctionalProperty, owl:InverseFunctionalProperty)</li> +<li>complex class definitions (owl:unionOf, owl:intersectionOf)</li> +</ul> +<p>SPIMBench uses and extends the ontologies of LDBC&rsquo;s Semantic Publishing Benchmark (SPB) to tackle the more complex schema constructs expressed in terms of OWL. It also extends SPB&rsquo;s data generator to first generate a synthetic source dataset that does not contain any matches, and then to generate matches and non-matches to entities of the source dataset to address the supported transformations and OWL constructs. The data generation process allows the creation of arbitrary large datasets, thus supporting the evaluation of both the scalability and the matching quality of an instance matching system.</p> +<p>Value and structure-based test cases are implemented using the SWING framework <a href="#references">[1]</a> on data and object type properties respectively. 
These are produced by applying the appropriate transformation(s) on a source instance to obtain a target instance. Semantics-based test cases are produced in the same way as with the value and structure-based test cases with the difference that appropriate triples are constructed and added in the target dataset to consider the respective OWL constructs.</p> +<p>SPIMBench, in addition to the semantics-based test cases that differentiate it from existing instance matching benchmarks, also offers a weighted gold standard used to judge the quality of answers of instance matching systems. It contains generated matches (a pair consisting of an entity of the source dataset and an entity of the target dataset) the type of test case it represents, the property on which a transformation was applied (in the case of value-based and structure-based test cases), and a weight that quantifies how easy it is to detect this match automatically. SPIMBench adopts an information-theoretical approach by applying multi-relational learning to compute the weight of the pair of matched instances by measuring the information loss that results from applying transformations to the source data to generate the target data. This detailed information, which is not provided by state of the art benchmarks, allows users of SPIMBench (e.g., developers of IM systems) to more easily identify the reasons underlying the performance results obtained using SPIMBench and thereby supports the debugging of instance matching systems.</p> +<p>SPIMBench can be downloaded from <a href="https://github.com/jsaveta/SPIMBench">our repository</a> and a more thorough description thereof can be found on <a href="http://www.ics.forth.gr/isl/spimbench/">http://www.ics.forth.gr/isl/spimbench/</a>.</p> +<h4 id="references">References</h4> +<p>[1] A. Ferrara, S. Montanelli, J. Noessner, and H. Stuckenschmidt. Benchmarking Matching Applications on the Semantic Web. 
In ESWC, 2011.</p> + + + + + Further Developments in SNB BI Workload + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + Thu, 18 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/further-developments-in-snb-bi-workload/ + <p>We are presently working on the SNB BI workload. Andrey Gubichev of TU Munchen and myself are going through the queries and are playing with two SQL based implementations, one on Virtuoso and the other on Hyper.</p> +<p>As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.</p> +<p>There are obvious marketing applications for a SNB-like dataset. There are also security related applications, ranging from fraud detection to intelligence analysis. The latter category is significant but harder to approach, as much of the detail of best practice is itself not in the open. In this post, I will outline some ideas discussed over time that might crystallize into a security related section in the SNB BI workload. We invite comments from practitioners for making the business questions more relevant while protecting sensitive details.</p> +<p>Let’s look at what scenarios would fit with the dataset. We have people, different kinds of connections between people, organizations, places and messages. Messages (posts/replies), people and organizations are geo-tagged. Making a finer level of geo-tagging, with actual GPS coordinates, travel itineraries etc, all referring to real places would make the data even more interesting. The geo dimension will be explored separately in a forthcoming post.</p> +<p>One of the first things to appear when approaching the question is that the analysis of behavior patterns over time is not easily captured in purely declarative queries. For example, temporal sequence of events and the quantity and quality of interactions between players lead to intractably long queries which are hard to understand and debug. 
Therefore, views and intermediate materializations become increasingly necessary.</p> +<p>Another feature of the scene is that information is never complete. Even if logs are complete for any particular system, there are always possible interactions outside of the system. Therefore we tend to get match scores more than strictly Boolean conditions. Since everybody is related to everybody else via a relatively short path, the nature and strength of the relationship is key to interpreting its significance.</p> +<p>Since a query consisting of scores and outer joins only is difficult to interpret and optimize, and since the information is seldom complete, some blanks may have to be filled in by guesses. The database must therefore contain metadata about this.</p> +<p>An orthogonal aspect to security applications is the access control of the database itself. One might assume that if a data warehouse of analyzable information is put together, the analyst would have access to the entirety of it. This is however not necessarily the case since the information itself and its provenance may fall under different compartments.</p> +<p>So, let’s see how some of these aspects could be captured in the SNB context.</p> +<p>Geography - We materialize a table of travel events, so that an unbroken sequence of posts from the same location (e.g. country) other than the residence of the poster forms a travel event. The posts may have a fine grained position (IP, GPS coordinates of photos) that marks an itinerary. This is already beyond basic SQL, needing a procedure or window functions.</p> +<p>The communication between people is implicit in reply threads and forum memberships. A reply is the closest that one comes to a person to person message in the dataset. Otherwise all content is posted to forums with more or fewer participants. Membership in a high traffic forum with few participants would indicate a strong connection. 
Calculating these time varying connection strengths is a lot of work and a lot of text in queries. Keeping things simple requires materializing a sparse “adjacency cube,” i.e. a relation of person1, person2, time bucket -&gt; connection strength. In the SNB case the connection strength may be derived from reciprocal replies, likes, being in the same forums, knowing each other etc. Selectivity is important, i.e. being in many small forums together counts for more than being in ones where everybody else also participates.</p> +<p>The behavior of people in SNB is not identical from person to person but for the same person follows a preset pattern. Suppose a question like “which person with access to secrets has a marked change of online behavior?” The change would be starting or stopping communication with a given set of people, for example. Think that the spy meets the future spymaster in a public occasion, has a series of exchanges, travels to an atypical destination, then stops all open contact with the spymaster or related individuals. Patterns like this do not occur in the data but can be introduced easily enough.</p> +<p>In John Le Carre’s A Perfect Spy the main character is caught because it comes to light that his travel routes nearly always corresponded to his controller’s. This would make a query. This could be cast in marketing terms as a “(un)common shopping basket.”</p> +<p>Analytics becomes prediction when one part of a pattern exists without the expected next stage. Thus the same query template can serve for detecting full or partial instances of a pattern, depending on how the scores are interpreted.</p> +<p>From a database angle, these questions group on an item with internal structure. For the shopping basket this is a set. For the travel routes this is an ordered sequence of space/time points, with a match tolerance on the spatial and temporal elements. Another characteristic is that there is a baseline of expectations and the actual behavior. 
Both have structure, e.g. the occupation/location/interest/age of one’s social circle. These need to be condensed into a sort of metric space and then changes and rates of change can be observed. Again, this calls for a multidimensional cube to be created as a summary, then algorithms to be applied to this. The declarative BI query a la TPC-H does not easily capture this all.</p> +<p>This leads us to graph analytics in a broader sense. Some of the questions addressed here will still fit in the materialized summaries+declarative queries pattern but the more complex summarization and clustering moves towards iterative algorithms.</p> +<p>There is at present a strong interest in developing graph analytics benchmarks in LDBC. This is an activity that extends beyond the FP7 project duration and beyond the initial partners. To this effect I have implemented some SQL extensions for BSP style processing, as hinted at on my blog. These will be covered in more detail in January, when there are actual experiments.</p> + + + + + Sizing AWS Instances for the Semantic Publishing Benchmark + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + Wed, 17 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/sizing-aws-instances-for-the-semantic-publishing-benchmark/ + <p>LDBC&rsquo;s <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (SPB) measures the performance of an RDF database in a load typical for metadata-based content publishing, such as the famous <a href="http://www.bbc.co.uk/blogs/legacy/bbcinternet/2010/07/bbc_world_cup_2010_dynamic_sem.html">BBC Dynamic Semantic Publishing</a> scenario. Such load combines tens of updates per second (e.g. adding metadata about new articles) with even higher volume of read requests (SPARQL queries collecting recent content and data to generate web page on a specific subject, e.g. Frank Lampard). 
As we <a href="https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues">wrote earlier</a>, SPB was already successfully used to help developers to identify performance issues and to introduce optimizations in SPARQL engines such as GraphDB and Virtuoso. Now we are at the point to experiment with different sizes of the benchmark and different hardware configurations.</p> +<p>Lately we tested different Amazon Web Services (<a href="https://aws.amazon.com/">AWS</a>) instance types for running SPB basic interactive query mix in parallel with the standard editorial updates – precisely the type of workload that <a href="https://www.ontotext.com/products/ontotext-graphdb/">GraphDB</a> experiences in the backend of BBC Sport website. We discovered and report below a number of practical guidelines about the optimal instance types and configurations. We have proven that SPB 50M workloads can be executed efficiently on a mid-sized AWS instance – c3.2xlarge machine executes 16 read queries and 15 update operations per second. For $1 paid to Amazon for such instance GraphDB executes 140 000 queries and 120 000 updates. The most interesting discovery in this experiment is that if BBC were hosting the triplestore behind their Dynamic Semantic Publishing architecture at AWS, the total cost of the server infrastructure behind their Worldcup 2010 website would have been about $80/day.</p> +<h3 id="the-experiment">The Experiment</h3> +<p>For our tests we use:</p> +<ul> +<li>GraphDB Standard v6.1</li> +<li>LDBC-SPB test driver (version 0.1.dc9a626 from 10.Nov.2014) configured as follows: +<ul> +<li>8 aggregation agents (read threads) and 2 editorial agents (write threads); for some configurations we experimented with different numbers of agents also</li> +<li>50M dataset (SF1)</li> +<li>40 minutes of benchmark run time (60 seconds of warm up)</li> +</ul> +</li> +<li>5 different Amazon EC2 instances and one local server</li> +</ul> +<p>Each test run is cold, i.e. 
data is newly loaded for each run. We set a 5 GByte cache configuration, which is sufficient for the size of the generated dataset. We use the same query substitution parameters (the same randomization seed) for every run, so that we are sure that all test runs are identical.</p> +<p>We use two types of instances – M3 and C3 instances. They both provide SSD storage for fast I/O performance. The M3 instances are with E5-2670v2, 2.50GHz CPU and provide good all-round performance, while the C3 instances are compute optimized with stronger CPU – E5-2680v2, 2.80GHz, but have half as much memory as the M3.</p> +<p>We also use a local physical server with dual-CPU – E5-2650v2, 2.60Ghz; 256GB of RAM and RAID-0 array of SSD in order to provide ground for interpretation of the performance for the virtualized AWS instances. The CPU capacity of the AWS instances is measured in vCPUs (virtual CPU). A vCPU is a logical core – one hyper-thread of one physical core of the corresponding Intel Xeon processor used by Amazon. This means that a vCPU represents roughly half a physical core, even though the performance of a hyper-threaded core is not directly comparable with two non-hyper-threaded cores. We should keep this in mind comparing AWS instances to physical machines, i.e. our local server with two CPUs with 8 physical cores each has 32 logical cores, which is more than c3.4xlarge instance with 16 vCPUs.</p> +<h3 id="the-results">The Results</h3> +<p>For the tests we measured:</p> +<ul> +<li><em>queries/s</em> for the read threads, where queries include SELECT and CONSTRUCT</li> +<li><em>updates/s</em> for the write threads, where an update operation is INSERT or DELETE</li> +<li><em>queries/$</em> and <em>updates/$</em> – respectively queries or updates per dollar is calculated for each AWS instance type based on price and update throughput</li> +<li><em>update/vCPU</em> – modification operations per vCPU per second</li> +</ul> +<p>Results (Table 1.) 
provide strong evidence that performance depends mostly on processor power. This applies to both queries and updates - which in the current AWS setup go on par with one another. Comparing M3 and C3 instances with equal vCPUs, we can see that performance is only slightly higher for the M3 machines and even lower for selects with 8 vCPUs. Taking into account the lower price of C3 because of their lower memory, it is clear that C3 machines are better suited for this type of workload and the sweet spot between price and performance is the c3.2xlarge machine.</p> +<p>The improvement in performance between the c3.xlarge and c3.2xlarge is more than twofold, whereas the improvement between c3.2xlarge and c3.4xlarge is considerably lower. We also observe slower growth between c3.4xlarge and the local server machine. This is an indication that for SPB at this scale the difference between 7.5GB and 15GB of RAM is substantial, but RAM above this amount cannot be utilized efficiently by GraphDB.</p> +<p>Table 1. 
SPB Measurement Results on AWS and Local Servers</p> +<table> +<thead> +<tr> +<th>Server Type</th> +<th>vCPUs</th> +<th>R/W Agents</th> +<th>RAM (GB)</th> +<th>&ldquo;Storage (GB, SSD)&rdquo;</th> +<th>Price USD/h</th> +<th>Queries/ sec.</th> +<th>Updates/ sec.</th> +<th>Queries/ USD</th> +<th>Updates/ USD</th> +<th>Updates/ vCPU</th> +</tr> +</thead> +<tbody> +<tr> +<td>m3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>15</td> +<td>2x 40</td> +<td>0.28</td> +<td>8.39</td> +<td>8.23</td> +<td>107 882</td> +<td>105 873</td> +<td>2.06</td> +</tr> +<tr> +<td>m3.2xlarge</td> +<td>8</td> +<td>8/2</td> +<td>30</td> +<td>2x 80</td> +<td>0.56</td> +<td>15.44</td> +<td>15.67</td> +<td>99 282</td> +<td>100 752</td> +<td>1.96</td> +</tr> +<tr> +<td>c3.xlarge</td> +<td>4</td> +<td>8/2</td> +<td>7.5</td> +<td>2x 40</td> +<td>0.21</td> +<td>7.17</td> +<td>6.78</td> +<td>122 890</td> +<td>116 292</td> +<td>1.7</td> +</tr> +<tr> +<td><strong>c3.2xlarge</strong></td> +<td><strong>8</strong></td> +<td><strong>8/2</strong></td> +<td><strong>15</strong></td> +<td><strong>2x 80</strong></td> +<td><strong>0.42</strong></td> +<td><strong>16.46</strong></td> +<td><strong>14.56</strong></td> +<td><strong>141 107</strong></td> +<td><strong>124 839</strong></td> +<td><strong>1.82</strong></td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>8/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>23.23</strong></td> +<td><strong>21.17</strong></td> +<td><strong>99 578</strong></td> +<td><strong>90 736</strong></td> +<td><strong>1.32</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>8/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>22.89</td> +<td>20.39</td> +<td>98 100</td> +<td>87 386</td> +<td>1.27</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/2</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.6</td> +<td>19.11</td> +<td>114 
000</td> +<td>81 900</td> +<td>1.19</td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>10/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>26.19</td> +<td>19.18</td> +<td>112 243</td> +<td>82 200</td> +<td>1.2</td> +</tr> +<tr> +<td><strong>c3.4xlarge</strong></td> +<td><strong>16</strong></td> +<td><strong>14/2</strong></td> +<td><strong>30</strong></td> +<td><strong>2x 160</strong></td> +<td><strong>0.84</strong></td> +<td><strong>30.84</strong></td> +<td><strong>16.88</strong></td> +<td><strong>132 171</strong></td> +<td><strong>72 343</strong></td> +<td><strong>1.06</strong></td> +</tr> +<tr> +<td>c3.4xlarge</td> +<td>16</td> +<td>14/3</td> +<td>30</td> +<td>2x 160</td> +<td>0.84</td> +<td>29.67</td> +<td>17.8</td> +<td>127 157</td> +<td>76 286</td> +<td>1.11</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.11</td> +<td>32.04</td> +<td>156 712</td> +<td>135 302</td> +<td>1</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>8/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>37.31</td> +<td>32.07</td> +<td>157 557</td> +<td>135 429</td> +<td>1</td> +</tr> +<tr> +<td><strong>Local</strong></td> +<td><strong>32</strong></td> +<td><strong>10/2</strong></td> +<td><strong>256</strong></td> +<td><strong>8x 256</strong></td> +<td><strong>0.85</strong></td> +<td><strong>40</strong></td> +<td><strong>31.01</strong></td> +<td><strong>168 916</strong></td> +<td><strong>130 952</strong></td> +<td><strong>0.97</strong></td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/2</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.39</td> +<td>26.42</td> +<td>153 672</td> +<td>111 569</td> +<td>0.83</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>14/3</td> +<td>256</td> +<td>8x 256</td> +<td>0.85</td> +<td>36.22</td> +<td>26.39</td> +<td>152 954</td> +<td>111 443</td> +<td>0.82</td> +</tr> +<tr> +<td>Local</td> +<td>32</td> +<td>20/2</td> +<td>256</td> +<td>8x 256</td> 
+<td>0.85</td> +<td>34.59</td> +<td>23.86</td> +<td>146 070</td> +<td>100 759</td> +<td>0.75</td> +</tr> +</tbody> +</table> +<h3 id="the-optimal-number-of-test-agents">The Optimal Number of Test Agents</h3> +<p>Experimenting with different numbers of aggregation (read) and editorial (write) agents at c3.4xlarge and the local server, we made some interesting observations:</p> +<ul> +<li>There is almost no benefit in using more than 2 write agents. This can be explained by the fact that certain aspects of handling writes in GraphDB are serialized, i.e. they cannot be executed in parallel across multiple write threads;</li> +<li>Using more read agents can have a negative impact on update performance. This is proven by the c3.4xlarge results with 8/2 and with 14/2 agents - while in the latter case GraphDB handles a slightly higher number of queries (31 vs. 23) we see a drop in the update rate (from 21 to 17);</li> +<li>Overall, the configuration with 8 read agents and 2 write agents delivers good balanced results across various hardware configurations;</li> +<li>For machines with more than 16 cores, a configuration like 10/2 or 14/2 would maximize the number of selects, still with good update rates. This way one can get 30 queries/sec. on c3.4xlarge and 40 queries/sec. on a local server;</li> +<li>Launching more than 14 read agents does not help even on the local server with 32 logical cores. This indicates that at this point we are reaching some constraints such as memory bandwidth or IO throughput and degree of parallelization.</li> +<li>There is some overhead when handling a bigger number of agents, as the local server tests with 14/3 and 20/2 show the worst results for both queries and updates.</li> +</ul> +<h3 id="efficiency-and-cost">Efficiency and Cost</h3> +<p>AWS instance type c3.2xlarge provides the best price/performance ratio for applications where 15 updates/sec. are sufficient even at peak times. 
More intensive applications should use type c3.4xlarge, which guarantees more than 20 updates/sec.</p> +<p>Cloud infrastructure providers like Amazon allow one to have a very clear account of the full cost for the server infrastructure, including hardware, hosting, electricity, network, etc.</p> +<p>$1 spent on c3.2xlarge ($0.42/hour) allows for handling 140 000 queries, along with more than 120 000 update operations!</p> +<p>The full cost of the server infrastructure is harder to compute in the case of purchasing a server and hosting it in a proprietary data center. Still, one can estimate the upper limits - for a machine like the local server used in this benchmark, this price is way lower than $1/hour. One should consider that this machine is with 256GB of RAM, which is an overkill for Semantic Publishing Benchmark run at 50M scale. Under all these assumptions we see that using a local server is cheaper than the most cost-efficient AWS instance. This is expected - owning a car is always cheaper than renting it for 3 years in a row. Actually, the fact that the difference of the prices/query in this case is low indicates that using AWS services comes at very low extra cost.</p> +<p>To put these figures in the context of a known real world application, let us model the case of a GraphDB Enterprise replication cluster with 2 master nodes and 6 worker nodes - the size of cluster that BBC used for their FIFA Worldcup 2010 project. Given c3.2xlarge instance type, the math works as follows:</p> +<ul> +<li><strong>100 queries/sec.</strong> handled by the cluster. This means about 360 000 queries per hour or more than 4 million queries per day. This is at least 2 times more than the actual loads of GraphDB at BBC during the peak times of big sports events.</li> +<li><strong>10 updates/sec.</strong> - the speed of updates in GraphDB Enterprise cluster is lower than the speed of each worker node in separation. 
There are relatively few content management applications that need more than 36 000 updates per hour.</li> +<li><strong>$81/day</strong> is the full cost for the server infrastructure. This indicates an annual operational cost for cluster of this type in the range of $30 000, even without any effort to release some of the worker nodes in non-peak times.</li> +</ul> + + + + + DATAGEN: a Realistic Social Network Data Generator + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + Sat, 06 Dec 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-a-realistic-social-network-data-generator/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/getting-started-with-snb">Getting started with snb</a>, <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">DATAGEN: data generation for the Social Network Benchmark</a>), Arnau Prat discussed the main features and characteristics of DATAGEN: <em>realism</em>, <em>scalability</em>, <em>determinism</em>, <em>usability</em>. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments that will shed some light on how realistic data produced by DATAGEN looks. For our testing, we generated a dataset of scale factor 10 (i.e., social network of 73K users during 3 years) and loaded it into Virtuoso by following the <a href="https://github.com/ldbc/ldbc_snb_datagen">instructions for generating a SNB dataset</a> and <a href="https://github.com/ldbc/ldbc_snb_implementations/tree/master/interactive/virtuoso">for loading the dataset into Virtuoso</a>. 
In the following sections, we analyze several aspects of the generated dataset.</p> +<h3 id="a-realistic-social-graph">A Realistic social graph</h3> +<p>One of the most complexly structured graphs that can be found in the data produced by DATAGEN is the friends graph, formed by people and their <em><knows></em> relationships. We used the R script after Figure 1 to draw the social degree distribution in the SNB friends graph. As shown in Figure 1, the cumulative social degree distribution of the friends graph is similar to that from Facebook (See the note about <a href="https://www.facebook.com/notes/facebook-data-team/anatomy-of-facebook/10150388519243859">Facebook Anatomy</a>). This is not by chance, as DATAGEN has been designed to deliberately reproduce the Facebook&rsquo;s graph distribution.</p> +<p><img src="Cumulative-distribution.png" alt="image"> <br> +Figure 1: Cumulative distribution #friends per user</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-r" data-lang="r"><span style="display:flex;"><span><span style="color:#75715e">#R script for generating the social degree distribution </span> +</span></span><span style="display:flex;"><span><span style="color:#75715e">#Input files: person_knows_person_*.csv</span> +</span></span><span style="display:flex;"><span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(data.table) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(igraph) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">library</span>(plotrix) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">require</span>(bit64) +</span></span><span style="display:flex;"><span>dflist <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">lapply</span>(<span 
style="color:#a6e22e">commandArgs</span>(trailingOnly <span style="color:#f92672">=</span> <span style="color:#66d9ef">TRUE</span>), fread, sep<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;|&#34;</span>, +</span></span><span style="display:flex;"><span> header<span style="color:#f92672">=</span>T, select<span style="color:#f92672">=</span><span style="color:#ae81ff">1</span><span style="color:#f92672">:</span><span style="color:#ae81ff">2</span>, colClasses<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;integer64&#34;</span>) +</span></span><span style="display:flex;"><span> df <span style="color:#f92672">&lt;-</span> <span style="color:#a6e22e">rbindlist</span>(dflist) <span style="color:#a6e22e">setNames</span>(df, <span style="color:#a6e22e">c</span>(<span style="color:#e6db74">&#34;P1&#34;</span>, <span style="color:#e6db74">&#34;P2&#34;</span>)) +</span></span><span style="display:flex;"><span>d2 <span style="color:#f92672">&lt;-</span> df[,<span style="color:#a6e22e">length</span>(P2),by<span style="color:#f92672">=</span>P1] +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">pdf</span>(<span style="color:#e6db74">&#34;socialdegreedist.pdf&#34;</span>) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">plot</span>(<span style="color:#a6e22e">ecdf</span>(d2<span style="color:#f92672">$</span>V1),main<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Cummulative distribution #friends per user&#34;</span>, +</span></span><span style="display:flex;"><span> xlab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Number of friends&#34;</span>, ylab<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;Percentage number of users&#34;</span>, log<span style="color:#f92672">=</span><span style="color:#e6db74">&#34;x&#34;</span>, +</span></span><span style="display:flex;"><span> xlim<span style="color:#f92672">=</span><span 
style="color:#a6e22e">c</span>(<span style="color:#ae81ff">0.8</span>, <span style="color:#a6e22e">max</span>(d2<span style="color:#f92672">$</span>V1) <span style="color:#f92672">+</span> <span style="color:#ae81ff">20</span>)) +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">dev.off</span>() +</span></span></code></pre></div><h3 id="data-correlations">Data Correlations</h3> +<p>Data in real life as well as in a real social network is correlated; e.g. names of people living in Germany have a different distribution than those living in Netherlands, people who went to the same university in the same period have a much higher probability to be friends and so on and so forth. In this experiment we will analyze if data produced by DATAGEN also reproduces these phenomena.</p> +<p><em>Which are the most popular names of a country?</em></p> +<p>We run the following query on the database built in Virtuoso, which computes the distribution of the names of the people for a given country. 
In this query, <em>&lsquo;A_country_name&rsquo;</em> is the name of a particular country such as <em>&lsquo;Germany&rsquo;, &lsquo;Netherlands&rsquo;, or &lsquo;Vietnam&rsquo;</em>.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> p_lastname, <span style="color:#66d9ef">count</span> (p_lastname) <span style="color:#66d9ef">as</span> namecnt +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">FROM</span> person, country +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> p_placeid <span style="color:#f92672">=</span> ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;A_country_name&#39;</span> +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> p_lastname <span style="color:#66d9ef">order</span> <span style="color:#66d9ef">by</span> namecnt <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we can see from Figures 2, 3, and 4, the distributions of names in Germany, Netherlands and Vietnam are different. A name that is popular in Germany such as <em>Muller</em> is not popular in the Netherlands, and it even does not appear in the names of people in Vietnam. We note that the names&rsquo; distribution may not be exactly the same as the contemporary names&rsquo; distribution in these countries, since the names resource files used in DATAGEN are extracted from Dbpedia, which may contain names from different periods of time.</p> +<p><img src="distribution-germany.png" alt="image"> <br> +Figure 2. 
Distribution of names in Germany</p> +<p><img src="distribution-netherlands.png" alt=""> <br> +Figure 3. Distribution of names in Netherlands</p> +<p><img src="distribution-vietnam.png" alt=""> <br> +Figure 4. Distribution of names in Vietnam</p> +<p><em>Where my friends are living?</em></p> +<p>We run the following query, which computes the locations of the friends of people living in China.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> fctry.ctry_name, <span style="color:#66d9ef">count</span> (<span style="color:#f92672">*</span>) <span style="color:#66d9ef">from</span> person <span style="color:#66d9ef">self</span>, person +</span></span><span style="display:flex;"><span>friend, country pctry, knows, country fctry +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> pctry.ctry_name <span style="color:#f92672">=</span> <span style="color:#e6db74">&#39;China&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> <span style="color:#66d9ef">self</span>.p_placeid <span style="color:#f92672">=</span> pctry.ctry_city +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> <span style="color:#66d9ef">self</span>.p_personid <span style="color:#66d9ef">and</span> friend.p_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> fctry.ctry_city <span style="color:#f92672">=</span> friend.p_placeid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> fctry.ctry_name <span 
style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As shown in the graph, most of the friends of people living in China are also living in China. The rest comes predominantly from near-by countries such as India, Vietnam.</p> +<p><img src="chinese-friends.png" alt=""> <br> +Figure 5. Locations of friends of people in China</p> +<p><em>Where my friends are studying?</em></p> +<p>Finally, we run the following query to find where the friends of people studying at a specific university (e.g., “Hangzhou_International_School”) are studying at.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-sql" data-lang="sql"><span style="display:flex;"><span><span style="color:#66d9ef">SELECT</span> top <span style="color:#ae81ff">10</span> o2.o_name, <span style="color:#66d9ef">count</span>(o2.o_name) <span style="color:#66d9ef">from</span> knows, person_university +</span></span><span style="display:flex;"><span>p1, person_university p2, organisation o1, organisation o2 +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">WHERE</span> +</span></span><span style="display:flex;"><span> p1.pu_organisationid <span style="color:#f92672">=</span> o1.o_organisationid +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> o1.o_name<span style="color:#f92672">=</span><span style="color:#e6db74">&#39;Hangzhou_International_School&#39;</span> +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> k_person1id <span style="color:#f92672">=</span> p1.pu_personid <span style="color:#66d9ef">and</span> p2.pu_personid <span style="color:#f92672">=</span> k_person2id +</span></span><span style="display:flex;"><span> <span style="color:#66d9ef">and</span> 
p2.pu_organisationid <span style="color:#f92672">=</span> o2.o_organisationid +</span></span><span style="display:flex;"><span><span style="color:#66d9ef">GROUP</span> <span style="color:#66d9ef">BY</span> o2.o_name <span style="color:#66d9ef">ORDER</span> <span style="color:#66d9ef">BY</span> <span style="color:#ae81ff">2</span> <span style="color:#66d9ef">desc</span>; +</span></span></code></pre></div><p>As we see from Figure 6, most of the friends of the Hangzhou International School students also study at that university. This is a realistic correlation, as people studying at the same university have a much higher probability to be friends. Furthermore, top-10 universities for the friends of the Hangzhou School students’ are from China, while people from foreign universities have small number of friends that study in Hangzhou School (See Table 1).</p> +<p><img src="friends-international-school.png" alt=""> <br> +Figure 6. Top-10 universities where the friends of Hangzhou International School students are studying at.</p> +<table> +<thead> +<tr> +<th>Name</th> +<th># of friends</th> +</tr> +</thead> +<tbody> +<tr> +<td>Hangzhou_International_School</td> +<td>12696</td> +</tr> +<tr> +<td>Anhui_University_of_Science_and_Technology</td> +<td>4071</td> +</tr> +<tr> +<td>China_Jiliang_University</td> +<td>3519</td> +</tr> +<tr> +<td>&hellip;</td> +<td></td> +</tr> +<tr> +<td>Darmstadt_University_of_Applied_Sciences</td> +<td>1</td> +</tr> +<tr> +<td>Calcutta_School_of_Tropical_Medicine</td> +<td>1</td> +</tr> +<tr> +<td>Chettinad_Vidyashram</td> +<td>1</td> +</tr> +<tr> +<td>Women&rsquo;s_College_Shillong</td> +<td>1</td> +</tr> +<tr> +<td>Universitas_Nasional</td> +<td>1</td> +</tr> +</tbody> +</table> +<p>Table 1. Universities where friends of Hangzhou International School students are studying at.</p> +<p>In a real social network, data is riddled with many more correlations; it is a true data mining task to extract these. 
Even though DATAGEN may not be able to model all the real life data correlations, it can generate a dataset that reproduce many of those important characteristics found in a real social network, and additionally introduce a series of plausible correlations in it. More and more interesting data correlations may also be found from playing with the SNB generated data.</p> + + + + + SNB Driver - Part 1 + https://ldbcouncil.org/post/snb-driver-part-1/ + Thu, 27 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-driver-part-1/ + <p>In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: <a href="https://github.com/ldbc/ldbc_driver/">https://github.com/ldbc/ldbc_driver/</a>. Multiple reference implementations by two vendors are available here: <a href="https://github.com/ldbc/ldbc_snb_implementations">https://github.com/ldbc/ldbc_snb_implementations</a>, and discussion of the schema, data properties, and related content is available here: <a href="https://github.com/ldbc/ldbc_snb_docs">https://github.com/ldbc/ldbc_snb_docs</a>.</p> +<p>The following will concentrate on key decisions and techniques that were developed to support scalable, repeatable, distributed workload execution.</p> +<h3 id="problem-description">Problem Description</h3> +<p>The driver generates a stream of operations (e.g. create user, create post, create comment, retrieve person&rsquo;s posts etc.) and then executes them using the provided database connector. To be capable of generating heavier loads, it executes the operations from this stream in parallel. 
If there were no dependencies between operations (e.g., reads that depend on the completion of writes) this would be trivial. This is the case, for example, for the classical TPC-C benchmark, where splitting the transaction stream into parallel clients (terminals) is trivial. However, for the LDBC SNB Interactive Workload this is not the case: some operations within the stream do depend on others, others are depended on, some both depend on others and are depended on, and some neither depend on others nor are they depended on.</p> +<p>Consider, for example, a Social Network Benchmark scenario, where the data generator outputs a sequence of events such as User A posted a picture, User B left a comment to the picture of User A, etc. The second event depends on the first one in the sense that there is a causal ordering between them: User B can only leave a comment on the picture once it has been posted. The generated events are already ordered by their time stamp, so in case of the single-threaded execution this ordering is observed by default: the driver issues a request to the SUT with the first event (i.e., User A posts a picture), and after its completion it issues the second event (create a comment). However, if events are executed in parallel, these two events may end up in different parallel sequences of events. 
Therefore, a driver needs a mechanism to ensure the dependency is observed even when the dependent events are in different parallel update streams.</p> +<p>The next blog entries in this series will discuss the approaches used in the driver to deal with these challenges.</p> + + + + + Making Semantic Publishing Execution Rules + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + Tue, 18 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-semantic-publishing-execution-rules/ + <p><a href="https://ldbcouncil.org/">LDBC</a> <a href="https://ldbcouncil.org/benchmarks/spb">SPB (Semantic Publishing Benchmark)</a> is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC&rsquo;s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an <a href="https://www.ontotext.com/products/ontotext-graphdb-owlim/">Ontotext Graph DB</a> deployment. Graph DB was formerly known as Owlim.</p> +<p>So, in SPB we wanted to address substantially more complex queries than the lookups that the BBC linked data platform primarily serves. Diverse dataset summaries, timelines and faceted search qualified by keywords and/or geography are examples of online user experience that SPB needs to cover.</p> +<p>SPB is not per se an analytical workload but we still find that the queries fall broadly in two categories:</p> +<ul> +<li> +<p>Some queries are centred on a particular search or entity. The data touched by the query size does not grow at the same rate as the dataset.</p> +</li> +<li> +<p>Some queries cover whole cross sections of the dataset, e.g. 
find the most popular tags across the whole database.</p> +</li> +</ul> +<p>These different classes of questions need to be separated in a metric, otherwise the short lookup dominates at small scales and the large query at large scales.</p> +<p>Another guiding factor of SPB was the BBC&rsquo;s and others&rsquo; express wish to cover operational aspects such as online backups, replication and fail-over in a benchmark. True, most online installations have to deal with these things, which are yet as good as absent from present benchmark practice. We will look at these aspects in a different article, for now, I will just discuss the matter of workload mix and metric.</p> +<p>Normally the lookup and analytics workloads are divided into different benchmarks. Here we will try something different. There are three things the benchmark does:</p> +<ul> +<li> +<p>Updates - These sometimes insert a graph, sometimes delete and re-insert the same graph, sometimes just delete a graph. These are logarithmic to data size.</p> +</li> +<li> +<p>Short queries - These are lookups that most often touch on recent data and can drive page impressions. These are roughly logarithmic to data scale.</p> +</li> +<li> +<p>Analytics - These cover a large fraction of the dataset and are roughly linear to data size.</p> +</li> +</ul> +<p>A test sponsor can decide on the query mix within certain bounds. A qualifying run must sustain a minimum, scale-dependent update throughput and must execute a scale-dependent number of analytical query mixes or run for a scale-dependent duration. The minimum update rate, the minimum number of analytics mixes and the minimum duration all grow logarithmically to data size. Within these limits, the test sponsor can decide how to mix the workloads. Publishing several results, emphasizing different aspects is also possible. 
A given system may be especially good at one aspect, leading the test sponsor to accentuate this.</p> +<p>The benchmark has been developed and tested at small scales, between 50 and 150M triples. Next we need to see how it actually scales. There we expect to see how the two query sets behave differently. One effect that we see right away when loading data is that creating the full text index on the literals is in fact the longest running part. For a SF 32 (1.6 billion triples) SPB database we have the following space consumption figures:</p> +<ul> +<li> +<p>46886 MB of RDF literal text</p> +</li> +<li> +<p>23924 MB of full text index for RDF literals</p> +</li> +<li> +<p>23598 MB of URI strings</p> +</li> +<li> +<p>21981 MB of quads, stored column-wise with default index scheme</p> +</li> +</ul> +<p>Clearly, applying column-wise compression to the strings is the best move for increasing scalability. The literals are individually short, so literal per literal compression will do little or nothing, but applying this by the column is known to get a 2x size reduction with Google Snappy. The full text index does not get much from column store techniques, as it already consists of words followed by space efficient lists of word positions. The above numbers are measured with Virtuoso column store, with quads column-wise and the rest row-wise. Each number includes the table(s) and any extra indices associated to them.</p> +<p>Let&rsquo;s now look at a full run at unit scale, i.e. 50M triples.</p> +<p>The run rules stipulate a minimum of 7 updates per second. The updates are comparatively fast, so we set the update rate to 70 updates per second. This is seen not to take too much CPU. We run 2 threads of updates, 20 of short queries and 2 of long queries. The minimum run time for the unit scale is 10 minutes, so we do 10 analytical mixes, as this is expected to take a little over 10 minutes. 
The run stops by itself when the last of the analytical mixes finishes.</p> +<p>The interactive driver reports:</p> +<pre tabindex="0"><code>Seconds run : 2144 + Editorial: + 2 agents + + 68164 inserts (avg : 46 ms, min : 5 ms, max : 3002 ms) + 8440 updates (avg : 72 ms, min : 15 ms, max : 2471 ms) + 8539 deletes (avg : 37 ms, min : 4 ms, max : 2531 ms) + + 85143 operations (68164 CW Inserts (98 errors), 8440 CW Updates (0 errors), 8539 CW Deletions (0 errors)) + 39.7122 average operations per second + + Aggregation: + 20 agents + + 4120 Q1 queries (avg : 789 ms, min : 197 ms, max : 6767 ms, 0 errors) + 4121 Q2 queries (avg : 85 ms, min : 26 ms, max : 3058 ms, 0 errors) + 4124 Q3 queries (avg : 67 ms, min : 5 ms, max : 3031 ms, 0 errors) + 4118 Q5 queries (avg : 354 ms, min : 3 ms, max : 8172 ms, 0 errors) + 4117 Q8 queries (avg : 975 ms, min : 25 ms, max : 7368 ms, 0 errors) + 4119 Q11 queries (avg : 221 ms, min : 75 ms, max : 3129 ms, 0 errors) + 4122 Q12 queries (avg : 131 ms, min : 45 ms, max : 1130 ms, 0 errors) + 4115 Q17 queries (avg : 5321 ms, min : 35 ms, max : 13144 ms, 0 errors) + 4119 Q18 queries (avg : 987 ms, min : 138 ms, max : 6738 ms, 0 errors) + 4121 Q24 queries (avg : 917 ms, min : 33 ms, max : 3653 ms, 0 errors) + 4122 Q25 queries (avg : 451 ms, min : 70 ms, max : 3695 ms, 0 errors) + + 22.5239 average queries per second. 
Pool 0, queries [ Q1 Q2 Q3 Q5 Q8 Q11 Q12 Q17 Q18 Q24 Q25 ] + + 45318 total retrieval queries (0 timed-out) + 22.5239 average queries per second +</code></pre><p>The analytical driver reports:</p> +<pre tabindex="0"><code>Aggregation: + 2 agents + + 14 Q4 queries (avg : 9984 ms, min : 4832 ms, max : 17957 ms, 0 errors) + 12 Q6 queries (avg : 4173 ms, min : 46 ms, max : 7843 ms, 0 errors) + 13 Q7 queries (avg : 1855 ms, min : 1295 ms, max : 2415 ms, 0 errors) + 13 Q9 queries (avg : 561 ms, min : 446 ms, max : 662 ms, 0 errors) + 14 Q10 queries (avg : 2641 ms, min : 1652 ms, max : 4238 ms, 0 errors) + 12 Q13 queries (avg : 595 ms, min : 373 ms, max : 1167 ms, 0 errors) + 12 Q14 queries (avg : 65362 ms, min : 6127 ms, max : 136346 ms, 2 errors) + 13 Q15 queries (avg : 45737 ms, min : 12698 ms, max : 59935 ms, 0 errors) + 13 Q16 queries (avg : 30939 ms, min : 10224 ms, max : 38161 ms, 0 errors) + 13 Q19 queries (avg : 310 ms, min : 26 ms, max : 1733 ms, 0 errors) + 12 Q20 queries (avg : 13821 ms, min : 11092 ms, max : 15435 ms, 0 errors) + 13 Q21 queries (avg : 36611 ms, min : 14164 ms, max : 70954 ms, 0 errors) + 13 Q22 queries (avg : 42048 ms, min : 7106 ms, max : 74296 ms, 0 errors) + 13 Q23 queries (avg : 48474 ms, min : 18574 ms, max : 93656 ms, 0 errors) + 0.0862 average queries per second. Pool 0, queries [ Q4 Q6 Q7 Q9 Q10 Q13 Q14 Q15 Q16 Q19 Q20 Q21 Q22 Q23 ] + + 180 total retrieval queries (2 timed-out) + 0.0862 average queries per second +</code></pre><p>The metric would be 22.52 qi/s, 310 qa/h, 39.7 u/s @ 50Mt (SF 1)</p> +<p>The SUT is dual Xeon E5-2630, all in memory. The platform utilization is steadily above 2000% CPU (over 20/24 hardware threads busy on the DBMS). 
The DBMS is Virtuoso open source (<a href="https://github.com/v7fasttrack/virtuoso-opensource/">v7fasttrack at github.com</a>, <a href="https://github.com/v7fasttrack/virtuoso-opensource/tree/feature/analytics">feature/analytics</a>).</p> +<p>The minimum update rate of 7/s was sustained but fell short of the target of 70/s. In this run, most demand was put on the interactive queries. Different thread allocations would give different ratios of the metric components. The analytics mix is for example about 3x faster without other concurrent activity.</p> +<p>Is this good or bad? I would say that this is possible but better can certainly be accomplished.</p> +<p>The initial observation is that Q17 is the worst of the interactive lot. 3x better is easily accomplished by avoiding a basic stupidity. The query does the evil deed of checking for a substring in a URI. This is done in the wrong place and accounts for most of the time. The query is meant to test geo retrieval but ends up doing something quite different. Optimizing this right would almost double the interactive score. There are some timeouts in the analytical run, which as such disqualifies the run. This is not a fully compliant result but is close enough to give an idea of the dynamics. So we see that the experiment is definitely feasible, is reasonably defined and that the dynamics seen make sense.</p> +<p>As an initial comment on the workload mix, I&rsquo;d say that interactive should have a few more very short point lookups to stress compilation times and give a higher absolute score of queries per second.</p> +<p>Adjustments to the mix will depend on what we find out about scaling. 
As with SNB, it is likely that the workload will shift a little, so this result might not be comparable with future ones.</p> +<p>In the next SPB article, we will look closer at performance dynamics and choke points and will have an initial impression on scaling the workload.</p> + + + + + Fifth TUC Meeting + https://ldbcouncil.org/event/fifth-tuc-meeting/ + Fri, 14 Nov 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fifth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce its fifth Technical User<br> +Community (TUC) meeting.</p> +<p>This will be a one-day event at the National Hellenic Research Foundation<br> +in Athens, Greece on <strong>Friday November 14, 2014</strong>.</p> +<h3 id="agenda">Agenda</h3> +<p>10:30 - 11:00 Coffee Break</p> +<p>11:00 - 11:10 Peter Boncz (VUA) Welcome &amp; LDBC project status update (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979841.pptx">Presentation</a>)</p> +<p>11:10 - 11:25 Venelin Kotsev (ONTO) Semantic Publishing Benchmark:Short Presentation of SPB and Status</p> +<p>Feedback &amp; Roadmap for SPB &amp; OWLIM (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979839.pdf">Presentation</a>)</p> +<p>11:25 - 11:30 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SPB &amp; Virtuoso (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979828.pdf">Presentation</a>)</p> +<p>11:30 - 11:45 Alex Averbuch (NEO) Social Network Benchmark: Short Presentation of SNB and Status, Feedback &amp; Roadmap for SNB &amp; Neo4J (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979830.pdf">Presentation</a>)</p> +<p>11:45 - 12:00 Orri Erling (OGL) Status, Feedback &amp; Roadmap for SNB &amp; Virtuoso (<a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979829.pdf">Presentation</a>)</p> +<p>12:00 - 12:20 Arnau Prat (UPC) &amp; Andrey Gubichev Status, Feedback &amp; Roadmap for SNB Interactive &amp; Sparksee (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979836.pdf">Presentation</a> ) and Business Intelligence (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979837.pdf">Presentation</a>)</p> +<p>12:20 - 12:40 Tomer Sagi, &ldquo;Experience with SNB and TitanDB at HP&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979838.pptx">Presentation</a> )</p> +<p>12:40 - 13:00 Jakob Nelson, &ldquo;graphbench.org on the SNB datagen&rdquo;</p> +<p>13:00 - 14:30 Lunch Break@Byzantine &amp; Christian Museum (<a href="http://www.byzantinemuseum.gr/en/">link</a>)</p> +<p>14:30 - 14:50 Olaf Hartig, &ldquo;Integrating the Property Graph and RDF data models&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979831.pdf">Presentation</a>)\</p> +<p>Documents: <a href="http://arxiv.org/abs/1409.3288">arxiv/1409.3288</a>, <a href="http://arxiv.org/abs/1406.3399">arxiv/1406.3399</a></p> +<p>14:50 - 15:10 Maria-Esther Vidal and Maribel Acosta, &ldquo;Challenges to be addressed during Benchmarking SPARQL Federated Engines&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979842.pdf">Presentation</a>)</p> +<p>15:10 - 15:30 Evaggelia Pitoura, &ldquo;Historical Queries on Graphs&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979835.pdf">Presentation</a>)</p> +<p>15:30 - 16:00 Coffee Break</p> +<p>16:00 - 16:20 Manolis Terrovitis, Giannis Liagos, George Papastefanatos, 
&ldquo;Efficient Identification of Implicit Facts in Incomplete OWL2-EL Knowledge Bases&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979843.pdf">Presentation</a>)</p> +<p>16:20 - 16:40 Gunes Aluc, &ldquo;WatDiv: How to Tune-up your RDF Data Management System&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979832.pdf">Presentation</a>)</p> +<p>16:40 - 17:00 Giorgos Kollias, Yannis Smaragdakis, &ldquo;Benchmarking @LogicBlox&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979840.pdf">Presentation</a>)</p> +<p>17:00 - 17:15 Hassan Chafi, &ldquo;Oracle Labs Graph Strategy&rdquo;</p> +<p>17:15 - 17:25 Yinglong Xia, &ldquo;Property Graphs for Industry Solution at IBM&rdquo; (<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/6979834.pdf">Presentation</a>)</p> +<p>17:25 - 17:30 Arthur Keen, &ldquo;Short Introduction to SPARQLcity&rdquo;</p> +<p><em><strong>20:30 Dinner @ Konservokouti <a href="https://plus.google.com/114240752029716758955/about?gl=gr&amp;hl=en">(link)</a></strong></em></p> +<p><em><strong>Get a Taxi, and go to Ippokratous 148, Athens, Neapoli Exarheion</strong></em></p> +<h4 id="logistics">Logistics</h4> +<p>The meeting will be held at the <a href="http://www.eie.gr/index-en.html">National Hellenic Research Foundation</a> located in <a href="http://www.eie.gr/location-en.html">downtown Athens</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fifth-tuc-meeting/attachments/5996808/5964344.gif" alt=""></p> +<h4 id="travel">Travel</h4> +<p>Athens, Greece&rsquo;s capital city, is easily accessible by air. 
Travelers on flights to Athens will land at Athens Eleftherios Venizelos International Airport.</p> +<p>To arrive in the city center, you can take the metro from the airport (Line #3) and stop at either stop Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations. You can also take express Bus X95 and stop again at either Evangelismos (ΕΥΑΓΓΕΛΙΣΜΟΣ) or at Syntagma (ΣΥΝΤΑΓΜΑ) stations (the latter is the terminus for the bus).</p> +<p>You can also take a taxi from the airport that runs on a fixed price for the city center (45 euros). More information on how to move around in Athens from the airport can be found here: <a href="http://www.aia.gr/traveler/">http://www.aia.gr/traveler/</a></p> + + + + + Getting Started With the Semantic Publishing Benchmark + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + Sun, 09 Nov 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-the-semantic-publishing-benchmark/ + <p>The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies proposed by BBC that define numerous properties for content; they contain all RDFS schema constructs and certain OWL ones.</p> +<p>The benchmark proposes a data generator that uses the ontologies provided by BBC and reference datasets (again provided by BBC) to produce a set of valid instances; it works with a predefined set of distributions derived from the reference datasets. In addition to these distributions, the data generator also models:</p> +<ul> +<li>clustering of creative works around certain entities from the reference datasets (e.g. 
the association of an entity with creative works would decay exponentially in time)</li> +<li>correlations between entities - there will be creative works about two entities for a certain period in time, that way a history of interactions is also modelled (e.g. J. Biden and B. Obama are tagged in creative works for a continuous period in time)</li> +</ul> +<p>The driver proposed by the benchmark measures the performance of CRUD operations of a SPARQL endpoint by starting a number of concurrently running editorial and aggregation agents. The former executes a series of insert, update and delete operations, whereas the latter a set of construct, describe, and select queries on a SPARQL endpoint. The benchmark can access all SPARQL endpoints that support the SPARQL 1.1 protocol. Tests have been run on OWLIM and Virtuoso. Attempts were also made for Stardog.</p> +<p>Currently, the benchmark offers two workloads: a base version that consists of a mix of nine queries of different complexity that consider nearly all the features of SPARQL 1.1 query language including sorting, subqueries, limit, regular expressions and grouping. 
The queries aim at checking different choke points relevant to query optimisation such as:</p> +<ul> +<li>join ordering based on cardinality constraints - expressed by the different kinds of properties defined in the schema</li> +<li>subselects that aggregate the query results that the optimiser should recognise and evaluate first</li> +<li>optional and nested optional clauses where the optimiser is called to produce a plan where the execution of the optional triple patterns is performed last</li> +<li>reasoning along the RDFS constructs (subclass, subproperty hierarchies, functional, object and transitive properties etc.)</li> +<li>unions to be executed in parallel</li> +<li>optionals that contain filter expressions that should be executed as early as possible in order to eliminate intermediate results</li> +<li>ordering where the optimiser could consider the possibility to choose query plan(s) that facilitate the ordering of results</li> +<li>handling of geo-spatial predicates</li> +<li>full-text search optimisation</li> +<li>asynchronous execution of the aggregate sub-queries</li> +<li>use of distinct to choose the optimal query plan</li> +</ul> +<p>We give below Query 1 of the Semantic Publishing Benchmark.</p> +<pre tabindex="0"><code>PREFIX bbcevent:&lt;http://www.bbc.co.uk/ontologies/event/&gt; +PREFIX geo-pos:&lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; +PREFIX bbc:&lt;http://www.bbc.co.uk/ontologies/bbc/&gt; +PREFIX time:&lt;http://www.w3.org/2006/time#&gt; +PREFIX event:&lt;http://purl.org/NET/c4dm/event.owl#&gt; +PREFIX music-ont:&lt;http://purl.org/ontology/mo/&gt; +PREFIX rdf:&lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt; +PREFIX foaf:&lt;http://xmlns.com/foaf/0.1/&gt; +PREFIX provenance:&lt;http://www.bbc.co.uk/ontologies/provenance/&gt; +PREFIX owl:&lt;http://www.w3.org/2002/07/owl#&gt; +PREFIX cms:&lt;http://www.bbc.co.uk/ontologies/cms/&gt; +PREFIX news:&lt;http://www.bbc.co.uk/ontologies/news/&gt; +PREFIX 
cnews:&lt;http://www.bbc.co.uk/ontologies/news/cnews/&gt; +PREFIX cconcepts:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX dbp-prop:&lt;http://dbpedia.org/property/&gt; +PREFIX geonames:&lt;http://sws.geonames.org/&gt; +PREFIX rdfs:&lt;http://www.w3.org/2000/01/rdf-schema#&gt; +PREFIX domain:&lt;http://www.bbc.co.uk/ontologies/domain/&gt; +PREFIX dbpedia:&lt;http://dbpedia.org/resource/&gt; +PREFIX geo-ont:&lt;http://www.geonames.org/ontology#&gt; +PREFIX bbc-pont:&lt;http://purl.org/ontology/po/&gt; +PREFIX tagging:&lt;http://www.bbc.co.uk/ontologies/tagging/&gt; +PREFIX sport:&lt;http://www.bbc.co.uk/ontologies/sport/&gt; +PREFIX skosCore:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX dbp-ont:&lt;http://dbpedia.org/ontology/&gt; +PREFIX xsd:&lt;http://www.w3.org/2001/XMLSchema#&gt; +PREFIX core:&lt;http://www.bbc.co.uk/ontologies/coreconcepts/&gt; +PREFIX curric:&lt;http://www.bbc.co.uk/ontologies/curriculum/&gt; +PREFIX skos:&lt;http://www.w3.org/2004/02/skos/core#&gt; +PREFIX cwork:&lt;http://www.bbc.co.uk/ontologies/creativework/&gt; +PREFIX fb:&lt;http://rdf.freebase.com/ns/&gt; + +# Query Name : query1 +# Query Description : +# Retrieve creative works about thing t (or that mention t) +# reasoning: rdfs:subClassOf, rdf:type +# join ordering: cwork:dateModified rdf:type owl:FunctionalProperty +# join ordering: cwork:dateCreated rdf:type owl:FunctionalProperty +# Choke Points : +# - join ordering based on cardinality of functional proerties cwork:dateCreated, cwork:dateModified +# Optimizer should use an efficient cost evaluation method for choosing the optimal join tree +# - A sub-select which aggregates results. Optimizer should recognize it and execute it first +# - OPTIONAL and nested OPTIONAL clauses (treated by query optimizer as nested sub-queries) +# Optimizer should decide to put optional triples on top of the join tree +# (i.e. 
delay their execution to the last possible moment) because OPTIONALs are treated as a left join +# - qiery optimizer has the chance to recognize the triple pattern : ?cWork a ?type . ?type rdfs:subClassOf cwork:CreativeWork +# and eliminate first triple (?cwork a ?type .) since ?cwork is a cwork:CreativeWork​ + +CONSTRUCT { + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:shortTitle ?shortTitle ; + cwork:about ?about ; + cwork:mentions ?mentions ; + cwork:dateCreated ?created ; + cwork:dateModified ?modified ; + cwork:description ?description ; + cwork:primaryFormat ?primaryFormat ; + bbc:primaryContentOf ?webDocument . + ?webDocument bbc:webDocumentType ?webDocType . + ?about rdfs:label ?aboutLabel ; + bbc:shortLabel ?aboutShortLabel ; + bbc:preferredLabel ?aboutPreferredLabel . + ?mentions rdfs:label ?mentionsLabel ; + bbc:shortLabel ?mentionsShortLabel ; + bbc:preferredLabel ?mentionsPreferredLabel . + ?creativeWork cwork:thumbnail ?thumbnail . + ?thumbnail a cwork:Thumbnail ; + cwork:altText ?thumbnailAltText ; + cwork:thumbnailType ?thumbnailType . +} +WHERE { + { + SELECT ?creativeWork + WHERE { + ?creativeWork {{{cwAboutOrMentions}}} {{{cwAboutOrMentionsUri}}} . + ?creativeWork a cwork:CreativeWork ; + cwork:dateModified ?modified . + } + ORDER BY DESC(?modified) + LIMIT 10 + } + ?creativeWork a cwork:CreativeWork ; + a ?type ; + cwork:title ?title ; + cwork:dateModified ?modified . + OPTIONAL { ?creativeWork cwork:shortTitle ?shortTitle . } + OPTIONAL { ?creativeWork cwork:description ?description . } + OPTIONAL { ?creativeWork cwork:about ?about . + OPTIONAL { ?about rdfs:label ?aboutLabel . } + OPTIONAL { ?about bbc:shortLabel ?aboutShortLabel . } + OPTIONAL { ?about bbc:preferredLabel ?aboutPreferredLabel . } + } + OPTIONAL { + ?creativeWork cwork:mentions ?mentions . + OPTIONAL { ?mentions rdfs:label ?mentionsLabel . } + OPTIONAL { ?mentions bbc:shortLabel ?mentionsShortLabel . 
} + OPTIONAL { ?mentions bbc:preferredLabel ?mentionsPreferredLabel . } + } + OPTIONAL { ?creativeWork cwork:dateCreated ?created . } + OPTIONAL { ?creativeWork cwork:primaryFormat ?primaryFormat . } + OPTIONAL { ?webDocument bbc:primaryContent ?creativeWork . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork bbc:primaryContentOf ?webDocument . + OPTIONAL { ?webDocument bbc:webDocumentType ?webDocType . } + } + OPTIONAL { ?creativeWork cwork:thumbnail ?thumbnail . + OPTIONAL { ?thumbnail cwork:altText ?thumbnailAltText . } + OPTIONAL { ?thumbnail cwork:thumbnailType ?thumbnailType . } + } +} +</code></pre><p>Listing 1. Semantic Publishing Benchmark: Query 1</p> +<p>The benchmark test driver is distributed as a jar file, but can also be built using an ant script. It is distributed with the BBC ontologies and reference datasets, the queries and update workloads discussed earlier and the configuration parameters for running the benchmark and for generating the data. It is organised in the following different phases: ontology loading and reference dataset loading, dataset generation and loading, warm up (where a series of aggregation queries are run for a predefined amount of time), benchmark where all queries (aggregation and editorial) are run, conformance checking (that allows one to check whether the employed RDF engine implements OWL reasoning) and finally cleanup that removes all the data from the repository. The benchmark provides a certain degree of freedom where each phase can run independently of the others.</p> +<p>The data generator uses an RDF repository to load ontologies and reference datasets; actually, any system that will be benchmarked should have those ontologies loaded. Any repository that will be used for the data generation should be set up with context indexing, and finally geo-spatial indexing, if available, to serve the spatial queries. 
The current version of the benchmark has been tested with Virtuoso and OWLIM.</p> +<p>The generator uses configuration files that must be configured appropriately to set the values regarding the dataset size to produce, the number of aggregation and editorial agents, the query time out etc. The distributions used by the data generator could also be edited. The benchmark is very simple to run (once the RDF repository used to store the ontologies and the reference datasets is set up, and the configuration files updated appropriately) using the command: java -jar semantic_publishing_benchmark-*.jar test.properties. The benchmark produces three kinds of files that contain (a) brief information about each executed query, the size of the returned result, and the execution time (semantic_publishing_benchmark_queries_brief.log), (b) the detailed log of each executed query and its result (semantic_publishing_benchmark_queries_detailed.log), and (c) the benchmark results (semantic_publishing_benchmark_results.log).</p> +<p>Below we give an example of a run of the benchmark for OWLIM-SE. The benchmark reports the number of edit operations (inserts, updates, and deletes) and queries executed at the Nth second of a benchmark run. It also reports the total number of retrieval queries as well as the average number of queries executed per second.</p> +<pre tabindex="0"><code>Seconds run : 600 + Editorial: + 0 agents + + 0 operations (0 CW Inserts, 0 CW Updates, 0 CW Deletions) + 0.0000 average operations per second + + Aggregation: + 8 agents + + 298 Q1 queries + 267 Q2 queries + 243 Q3 queries + 291 Q4 queries + 320 Q5 queries + 286 Q6 queries + 255 Q7 queries + 274 Q8 queries + 271 Q9 queries + + 2505 total retrieval queries + 4.1750 average queries per second +</code></pre><p>Listing 2. 
A snippet of semantic_publishing_benchmark_results.log</p> +<p>We run the benchmark under the following configuration: we used 8 aggregation agents for query execution and 4 data generator workers all running in parallel. The warm up period is 120 seconds during which a number of aggregation agents is executed to prepare the tested systems for query execution. Aggregation agents run for a period of 600 seconds, and queries timeout after 90 seconds. We used 10 sets of substitution parameters for each query. For data generation, ontologies and reference datasets are loaded in the OWLIM-SE repository. We used OWLIM-SE, Version 5.4.6287 with Sesame Version 2.6 and Tomcat Version 6. The results we obtained for the 10M, 100M and 1B triple datasets are given in the table below:</p> +<table> +<thead> +<tr> +<th>#triples</th> +<th>Q1</th> +<th>Q2</th> +<th>Q3</th> +<th>Q4</th> +<th>Q5</th> +<th>Q6</th> +<th>Q7</th> +<th>Q8</th> +<th>Q9</th> +<th>#queries</th> +<th>avg. #q. per sec.</th> +</tr> +</thead> +<tbody> +<tr> +<td>10M</td> +<td>298</td> +<td>267</td> +<td>243</td> +<td>291</td> +<td>320</td> +<td>286</td> +<td>255</td> +<td>274</td> +<td>271</td> +<td>2505</td> +<td>4.1750</td> +</tr> +<tr> +<td>100M</td> +<td>53</td> +<td>62</td> +<td>51</td> +<td>52</td> +<td>44</td> +<td>62</td> +<td>25</td> +<td>55</td> +<td>45</td> +<td>449</td> +<td>0.7483</td> +</tr> +<tr> +<td>1B</td> +<td>34</td> +<td>29</td> +<td>22</td> +<td>24</td> +<td>25</td> +<td>29</td> +<td>0</td> +<td>29</td> +<td>28</td> +<td>220</td> +<td>0.3667</td> +</tr> +</tbody> +</table> + + + + + Choke Point Based Benchmark Design + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/choke-point-based-benchmark-design/ + <p>The <em>Linked Data Benchmark Council</em> (LDBC) mission is to design and maintain benchmarks for graph data management systems, and establish and enforce standards in running these benchmarks, and publish and 
arbitrate around the official benchmark results. The council and its <a href="https://ldbcouncil.org">https://ldbcouncil.org</a> website just launched, and in its first 1.5 years of existence, most effort at LDBC has gone into investigating the needs of the field through interaction with the LDBC Technical User Community (<a href="https://ldbcouncil.org/event/fifth-tuc-meeting">next TUC meeting</a> will be on October 5 in Athens) and indeed in <em>designing benchmarks</em>.</p> +<p>So, what makes a good benchmark design? Many talented people have paved our way in addressing this question and for relational database systems specifically the benchmarks produced by <a href="http://www.tpc.org/">TPC</a> have been very helpful in maturing relational database technology, and making it successful. Good benchmarks are <em>relevant</em> and <em>representative</em> (address important challenges encountered in practice), <em>understandable</em> , <em>economical</em> (implementable on simple hardware), <em>fair</em> (such as not to favor a particular product or approach), <em>scalable</em>, <em>accepted</em> by the community and <em>public</em> (e.g. all of its software is available in open source). This list stems from Jim Gray&rsquo;s <a href="http://research.microsoft.com/en-us/um/people/gray/BenchmarkHandbook/TOC.htm">Benchmark Handbook</a>. In this blogpost, I will share some thoughts on each of these aspects of good benchmark design.</p> +<p>A very important aspect of benchmark development is making sure that the community <em>accepts</em> a certain benchmark, and starts using it. A benchmark without published results and therefore opportunity to compare results, remains irrelevant. A European FP7 project is a good place to start gathering a critical mass of support (and consensus, in the process) for a new benchmark from the core group of benchmark designers in the joint work performed by the consortium. 
Since in LDBC multiple commercial graph and RDF vendors are on the table (Neo Technologies, Openlink, Ontotext and Sparsity) a minimal consensus on <strong>fairness</strong> had to be established immediately. The Linked Data Benchmark Council itself is a noncommercial, neutral, entity which releases all its benchmark specifications, software, as well as many materials created during the design. LDBC has spent a lot of time engaging interested parties (mainly through its <a href="https://ldbcouncil.org/tags/tuc-meeting/">Technical User Community gatherings</a>) as well as lining up additional organizations as members of the Linked Data Benchmark Council. There is, in other words, a strong non-technical, human factor in getting benchmarks accepted.</p> +<p>The need for <em>understandability</em> for me means that a database benchmark should consist of a limited number of queries and result metrics. Hence I find TPC-H with its 22 queries more understandable than TPC-DS with its 99, because after (quite some) study and experience it is possible to understand the underlying challenges of all queries in TPC-H. It may also be possible for TPC-DS but the amount of effort is just much larger. Understandable also means for me that a particular query should behave similarly, regardless of the query parameters. Often, a particular query needs to be executed many times, and in order not to play into the hands of simple query caching and also enlarge the access footprint of the workload, different query parameters should be used. However, parameters can strongly change the nature of a query but this is not desirable for the understandability of the workload. For instance, we know that TPC-H Q01 tests raw computation power, as its selection predicate eliminates almost nothing from the main fact table (LINEITEM), that it scans and aggregates into a small 4-tuple result. 
Using a selection parameter that would select only 0.1% of the data instead, would seriously change the nature of Q01, e.g. making it amenable to indexing. This stability of parameter bindings is an interesting challenge for the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) of LDBC which is not as uniform and uncorrelated as TPC-H. Addressing the challenge of obtaining parameter bindings that have similar execution characteristics will be the topic of a future blog post.</p> +<p>The <em>economical</em> aspect of benchmarking means that while rewarding high-end benchmark runs with higher scores, it is valuable if a meaningful run can also be done with small hardware. For this reason, it is good practice to use a performance-per-EURO (or $) metric, so small installations despite a lower absolute score can still do well on that metric. The economical aspect is right now hurting the (still) leading relational OLTP benchmark TPC-C. Its implementation rules are such that for higher reported rates of throughput, a higher number of warehouses (i.e. larger data size) is needed. In the current day and age of JIT-compiled machinecode SQL procedures and CPU-cache optimized main memory databases, the OLTP throughput numbers now obtainable on modern transactional systems like Hyper on even a single server (it reaches more than 100.000 transactions per second) are so high that they lead to petabyte storage requirements. Not only does this make TPC-C very expensive to run, just by the sheer amount of hardware needed according to the rules, but it also undermines its representativity, since OLTP data sizes encountered in the field are much smaller than OLAP data sizes and do not run in the petabytes.</p> +<p><em>Representative</em> benchmarks can be designed by studying or even directly using real workload information, e.g. query logs. 
A rigorous example of this is the <a href="http://aksw.org/Projects/DBPSB.html">DBpedia benchmark</a> whose workload is based on the query logs of dbpedia.org. However, this SPARQL endpoint is a single public Virtuoso instance that has been configured to interrupt all long running queries, such as to ensure the service remains responsive to as many users as possible. As a result, it is only practical to run small lookup queries on this database service, so the query log contained solely such light queries. As a consequence, the DBpedia benchmark only tests small SPARQL queries that stress simple B-tree lookups only (and not joins, aggregations, path expressions or inference) and poses almost no technical challenges for either query optimization or execution. The lesson, thus, is to balance representativity with relevance (see later).</p> +<p>The fact that a benchmark can be <em>scaled</em> in size favors the use of synthetic data (i.e. created by a data generator) because data generators can produce any desired quantity of data. I hereby note that in this day and age, data generators should be parallel. Single-threaded single-machine data generation just becomes unbearable even at terabyte scales. A criticism of synthetic data is that it may not be representative of real data, which e.g. tends to contain highly correlated data with skewed distributions. This may be addressed to a certain extent by injecting specific skew and correlations into synthetic data as well (but: which skew and which correlations?). An alternative is to use real data and somehow blow up or contract the data. This is the approach in the mentioned DBpedia benchmark, though such scaling will distort the original distributions and correlations. Scaling a benchmark is very useful to investigate the effect of data size on the metric, on individual queries, or even in micro-benchmark tests that are not part of the official query set. 
Typically OLTP database benchmarks have queries whose complexity is O(log(N)) of the data size N, whereas OLAP benchmarks have queries which are linear, O(N) or at most O(N.log(N)) &ndash; otherwise executing the benchmark on large instances is infeasible. OLTP queries thus typically touch little data, in the order of log(N) tuples. In order not to measure fully cold query performance, OLTP benchmarks for that reason need a warmup phase with O(N/log(N)) queries in order to get the system into a representative state.</p> +<p>Now, what makes a benchmark <em>relevant</em>? In LDBC we think that benchmarks should be designed such that crucial areas of functionality are highlighted, and in turn system architects are stimulated to innovate. Either to catch up with competitors and bring the performance and functionality in line with the state-of-the-art but even to innovate and address technical challenges for which until now no good solutions exist, but which can give a decisive performance advantage in the benchmark. Inversely stated, benchmark design can thus be a powerful tool to influence the industry, as a benchmark design may set the agendas for multiple commercial design teams and database architects around the globe. To structure this design process, LDBC introduces the notion of <em>&ldquo;choke points&rdquo;</em>: by which we mean problems that challenge current technology. These choke points are collected and described early in the LDBC design process, and the workloads developed later are scored in terms of their coverage of relevant choke points. In case of graph data querying, one of the choke points that is unique to the area is recursive Top-N query handling (e.g. shortest path queries). Another choke point that arises is the impact of correlations between attribute value of graph nodes (e.g. both employed by TUM) and the connectivity degree between nodes (the probability to be friends). 
The notion observed in practice is that people who are direct colleagues are often in each other&rsquo;s friend network. A query that selects people in a social graph that work for the same company, and then does a friendship traversal, may get bad intermediate result size estimates and therefore a suboptimal query plan, if optimizers remain unaware of value/structure correlations. So this is an area of functionality that the Social Network Benchmark (SNB) by LDBC will test.</p> +<p>To illustrate what choke points are in more depth, we wrote a <a href="https://ldbcouncil.org/docs/papers/tpc-h-analyzed-choke-points-tpctc2013.pdf">paper in the TPCTC 2013</a> conference that performs a post-mortem analysis of TPC-H and identifies 28 such choke points. <em><a href="chokepoints.png">This table</a></em> lists them all, grouped into six Choke Point (CP) areas (CP1 Aggregation, CP2 Join, CP3 Locality, CP4 Calculations, CP5 Subqueries and CP6 Parallelism). The classification also shows CP coverage over each of the 22 TPC-H queries (black is high impact, white is no impact):</p> +<p>I would recommend reading this paper to anyone who is interested in improving the TPC-H score of a relational database system, since this paper contains the collected experience of three database architects who have worked with TPC-H at length: Orri Erling (of Virtuoso), Thomas Neumann (Hyper,RDF-3X), and me (MonetDB,Vectorwise). Recently Orri Erling showed that this paper is not complete as he discovered one more choke-point area for TPC-H: Top-N pushdown. In a detailed blog entry, Orri shows how this technique can <a href="http://www.openlinksw.com/weblog/oerling/?id=1779">trivialize Q18</a>; and this optimization can single-handedly improve the overall TPC-H score by 10-15%. 
This is also a lesson for LDBC: even though we design benchmarks with choke points in mind, the queries themselves may bring to light unforeseen opportunities and choke-points that may give rise to yet unknown innovations.</p> +<p>LDBC has just published two benchmarks as Public Drafts, which essentially means that you are cordially invited to download and try out the RDF-focused Semantic Publishing Benchmark <a href="https://ldbcouncil.org/developer/spb">(SPB)</a> and the more graph-focused Social Network Benchmark (<a href="https://ldbcouncil.org/developer/snb">SNB</a>), and <a href="https://groups.google.com/forum/#!forum/ldbcouncil">tell us what you think</a>. Stay tuned for the coming detailed blog posts about these benchmarks, which will explain the graph and RDF processing choke-points that they test.</p> +<p><em>(for more posts from Peter Boncz, see also <a href="https://databasearchitects.blogspot.com">Database Architects</a>, a blog about data management challenges and techniques written by people who design and implement database systems)</em></p> + + + + + New Website Online LDBC Benchmarks Reach Public Draft + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/new-website-online-ldbc-benchmarks-reach-public-draft/ + <p>The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23 2014, in announcing that two of the benchmarks that it has been developing since 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be expanded in a few weeks with a mix of read- and insert-queries. Also, query validation will be added later. 
Watch this blog for the announcements to come, as this will be a matter of weeks to add.</p> +<p>The Public Draft stage means that the initial software (data generator, query driver) works and an initial technical specification and documentation have been written. In other words, there is a testable version of the benchmark available for anyone who is interested. Public Draft status does not mean that the benchmark has been adopted yet, it rather means that LDBC has come closer to adopting them, but is now soliciting feedback from the users. The benchmarks will remain in this stage at least until October 6. On that date, LDBC is organizing its fifth <a href="https://ldbcouncil.org/event/fifth-tuc-meeting">Technical User Community meeting</a>. One of the themes for that meeting is collecting user feedback on the Public Drafts; which input will be used to either further evolve the benchmarks, or adopt them.</p> +<p>You can also see that we created this new website and a new logo. This website is different from <code>http://ldbc.eu</code> that describes the EU project which kick-starts LDBC. The ldbcouncil.org is a website maintained by the Linked Data Benchmark Council legal entity, which will live on after the EU project stops (in less than a year). The Linked Data Benchmark Council is an independent, impartial, member-sustained organization dedicated to the creation of RDF and graph data management benchmarks and benchmark practices.</p> +<p>In the next weeks, you will see many contributors in LDBC post items on this blog. 
Some of these blog entries will be very technical, others not, but all aim to explain what LDBC is doing for RDF and graph benchmarking, and why.</p> + + + + + Social Network Benchmark Goals + https://ldbcouncil.org/post/social-network-benchmark-goals/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/social-network-benchmark-goals/ + <p>Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established for the data attributes they represent, and queries such as “find the friend of a specified person who has long worked in a company in a specified country” are natural for the users and easy to understand from a functional point of view.</p> +<p>From a totally different perspective, Social Networks are challenging technologically, being part of the Big Data arena, and require the execution of queries that involve complex relationship search and data traversal computations that turn out to be choke points for the data management solutions in the market.</p> +<p>With the objective of shaping a benchmark which is up to date as a use case, well understood by everybody and poses significant technological challenges, the LDBC consortium decided to create the Social Network Benchmark, <a href="https://ldbcouncil.org/benchmarks/snb">SNB</a>, which is eventually going to include three workloads: the Interactive, the Business Intelligence and the Analytical. 
Those workloads are going to share a unique synthetic data generation tool that will mimic the data managed by real Social Networks.</p> +<p>The SNB data generator created by LDBC is an evolution of the S3G2 data generator and can be found at the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">LDBC Github repository</a>. The data generator is unique because it generates data that contains realistic distributions and correlations among variables that were not taken into consideration before. It also allows generating large datasets because it uses a Hadoop based implementation to compute the complex data generated. The SNB data generator has already been used in different situations like the <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD programming contest 2014</a>.</p> +<p>The SNB presents the Interactive workload as first of a breed with the objective to resemble the queries that users may place to a Social Network portal. Those are a combination of read and write small queries that express the needs of a user who is interacting with her friends and connections through the Social Network. 
Queries like that explained above (Q12 in the workload) are examples that set up choke points like pattern recognition or full traversals.</p> +<p>More details will be given in blogs to follow both for the data generator as well as for the specific characteristics of the workloads allowing the users to obtain a first contact with the benchmarks.</p> + + + + + Welcome to the New Industry Oriented LDBC Organisation for Benchmarking RDF and Graph Technologies + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + Tue, 14 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/welcome-to-the-new-industry-oriented-ldbc-organisation-for-benchmarking-rdf-and-graph-technologies/ + <p>It is with great pleasure that we announce the new LDBC organisation site at <a href="https://www.ldbcouncil.org">www.ldbcouncil.org</a>. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its interactive workload, which you will find in the <em>benchmarks</em> menu on this site.</p> +<p>Those benchmarks will allow all the actors in the RDF and Graph industry to know who is who and how the different technology players are reacting to the results of their competing industry companies. Thus, the users will have results to compare the technologies and vendors will have a clear idea of how their products evolve compared to other vendors, all with the objective to foster the technological growth of the RDF and Graph arena.</p> +<p>While the main objective of LDBC is to create benchmarks, we know that we need a strong community to grow and evolve those benchmarks taking into consideration all the market and technology needs. 
With this objective, we have created a special section to engage all the interested community through a blog, forums to discuss interesting issues and a lot of information on benchmarking, including links to other benchmarks, pointers to interesting conferences and venues and all the publications on benchmarking RDF and Graph technologies.</p> +<p>We want to make sure that we all know what benchmarking and the LDBC effort means, both historically, and from the global needs perspective. To make sure that this is accomplished, we set up a section open to the public with in depth explanations of the history of industry benchmarking, LDBC and why our society needs such efforts globally.</p> +<p>Finally, we want to invite you to our Fifth Technical Users Community (TUC) meeting to be held in Athens next Monday Oct. 6th 2014. This event will have as its main objective to allow for presentations on experiences with the two already released benchmarks, SNB and SPB. You’ll find updated information here.</p> +<p>In all, we expect that the LDBC organisation site engages all of you and that the growth of RDF and Graph technologies in the future is secured by the benchmarks fostered by us.</p> + + + + + 2nd International Workshop on Benchmarking RDF Systems + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/2nd-international-workshop-on-benchmarking-rdf-systems/ + <p>Following the 1st International workshop on Benchmarking RDF Systems (BeRSys 2013) the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industrials can meet to discuss topics related to the performance of RDF systems. 
BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines - in the line of TPCTC series of workshops.The focus of the workshop is to expose and initiate discussions on best practices, different application needs and scenarios related to different aspects of RDF data management.</p> +<p>More at: <a href="http://events.sti2.at/bersys2014/">http://events.sti2.at/bersys2014/</a></p> + + + + + DATAGEN: Data Generation for the Social Network Benchmark + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/ + <p>As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 <a href="#references">[1]</a>.</p> +<p>One of the most important components of a benchmark is the dataset. However, directly using real data in a benchmark is not always possible. On the one hand, it is difficult to find data with all the scaling characteristics the benchmark requires. On the other hand, collecting real data can be expensive or simply not possible due to privacy concerns.</p> +<p>For these reasons, LDBC-SNB provides DATAGEN which is the synthetic data generator responsible for generating the datasets for the three LDBC-SNB workloads: the Interactive, the Business Intelligence and the Analytical. DATAGEN has been carefully designed with the following goals in mind:</p> +<ul> +<li><strong>Realism.</strong> The data generated by DATAGEN has to mimic the features of those found in a real social network. 
In DATAGEN, output attributes, cardinalities, correlations and distributions have been finely tuned to reproduce a real social network in each of its aspects. DATAGEN is aware of the data and link distributions found in a real social network such as Facebook <a href="#references">[2]</a>. Also, it uses real data from DBPedia, such as property dictionaries, which ensure that the content is realistic and correlated.</li> +<li><strong>Scalability.</strong> Since LDBC-SNB is targeting systems of different scales and budgets, DATAGEN must be capable of generating datasets of different sizes, from a few Gigabytes to Terabytes. DATAGEN is implemented following the MapReduce paradigm, allowing for the generation of large datasets on commodity clusters.</li> +<li><strong>Determinism.</strong> DATAGEN is deterministic regardless of the number of cores/machines used to produce the data. This important feature guarantees that all Test Sponsors will face the same dataset, thus, making the comparisons between different systems fair and the benchmarks’ results reproducible.</li> +<li><strong>Usability.</strong> LDBC-SNB has been designed to have an affordable entry point. As such, DATAGEN has been strongly influenced by this philosophy, and therefore it has been designed to be as easy to use as possible.</li> +</ul> +<p>Finally, the area of action of DATAGEN is not only limited to the scope of LDBC-SNB. Several researchers and practitioners are already using DATAGEN in a wide variety of situations. If you are interested in the internals and possibilities of DATAGEN, please visit its official repository (<a href="https://github.com/ldbc/ldbc_snb_datagen">https://github.com/ldbc/ldbc_snb_datagen</a>).</p> +<h4 id="references">References</h4> +<p>[1] Pham, Minh-Duc, Peter Boncz, and Orri Erling. &ldquo;S3g2: A scalable structure-correlated social graph generator.&rdquo; Selected Topics in Performance Evaluation and Benchmarking. Springer Berlin Heidelberg, 2013. 
156-172.</p> +<p>[2] Prat-Pérez, Arnau, and David Dominguez-Sal. &ldquo;How community-like is the structure of synthetically generated graphs?.&rdquo; Proceedings of Workshop on GRAph Data management Experiences and Systems. ACM, 2014.</p> + + + + + Getting Started With SNB + https://ldbcouncil.org/post/getting-started-with-snb/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/getting-started-with-snb/ + <p>In a previous blog post titled &ldquo;<a href="https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/">Is SNB like Facebook&rsquo;s LinkBench?</a>&rdquo;, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.</p> +<h3 id="datagen">DATAGEN</h3> +<p>DATAGEN is the data generator used by all the workloads of SNB. <a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark/">Here</a> we introduced the design goals that drive the development of DATAGEN, which can be summarized as: <em>Realism, Scalability, Determinism and Usability.</em></p> +<p>DATAGEN produces datasets with the following schema, in terms of entities and their relations. Data generated represents a snapshot of the activity of a social network similar to real social networks such as Facebook, during a period of time. Data includes entities such as Persons, Organizations, and Places. The schema also models the way persons interact, by means of the friendship relations established with other persons, and the sharing of content such as messages (both textual and images), replies to messages and likes to messages. People form groups to talk about specific topics, which are represented as tags.</p> +<p><img src="schema.png" alt="image"></p> +<p>For the sake of credibility, data produced by DATAGEN has to be realistic. 
In this sense, data produced by DATAGEN not only has a realistic schema, but also pays attention to the following items:</p> +<ul> +<li> +<p>Realistic distributions. The degree distribution of friendship relationships has been modeled to reproduce that found in the Facebook graph. Also, other distributions such as the number of replies to a post, the number of persons per country or the popularity of a tag have been realistically modeled either using known distributions or data extracted from real sources such as Dbpedia.</p> +</li> +<li> +<p>Correlated attributes and relations. Attribute values are not chosen at random, but follow correlations. For instance, people from a specific country have a larger probability to have names typical from that country, to work at companies from that country or to study at universities of that country. Also, DATAGEN implements a relationship creation process that tries to reproduce the homophily principle, that is, people with similar characteristics tend to be connected.</p> +</li> +</ul> +<p>DATAGEN is built on top of Hadoop, to generate datasets of different sizes. It works either on single node SMP machines or a cluster environment. DATAGEN supports different output formats targeting different systems. On the one hand, we have the CSV format, where each entity and relation is output into a different comma separated value file. On the other hand, it also supports the Turtle format for RDF systems.</p> +<p>Finally, DATAGEN outputs two other things:</p> +<ul> +<li> +<p>Update Streams, which will be used in the future to implement updates in the workloads.</p> +</li> +<li> +<p>Substitution parameters, which are the parameters of the query instances the LDBC driver will issue. These are selected so the query plans of the resulting query executions do not differ significantly.</p> +</li> +</ul> +<p>Configuring and using DATAGEN is easy. 
Please visit <a href="https://github.com/ldbc/ldbc_snb_datagen">this page</a> for more information.</p> +<h3 id="ldbc-driver">LDBC driver</h3> +<p>SNB is designed to be as easy to adopt as possible. Therefore, SNB provides the LDBC execution driver, which is designed to automatically generate the benchmark workload and gather the benchmark results. It then generates a stream of operations in conformance with a workload definition, and executes those operations against some system using the provided database connector, and with the substitution parameters produced by DATAGEN. During execution, the driver continuously measures performance metrics, then upon completion it generates a report of those metrics.</p> +<p>It is capable of generating parallel workloads (e.g. concurrent reads and writes), while respecting the configured operation mix and ensuring that ordering between dependent operations is maintained. For further details on how the driver achieves that, please visit the Documentation <a href="https://github.com/ldbc/ldbc_driver/wiki">page</a>.</p> +<p>The test sponsor (aka the implementer of the benchmark) has to provide a set of implemented interfaces, that form a benchmark implementation to plug into the driver, and then the benchmark is automatically executed.</p> +<p>Given a workload consisting of a series of <em>Operations</em>, the test sponsor implements <em>OperationHandlers</em> for them. <em>OperationHandlers</em> are responsible for executing instances of a specific operation (query) type. This is done by overriding the method <em>executeOperation</em>(), which receives as input parameter an <em>Operation</em> instance and returns the result. From the <em>Operation</em> instance, the operation&rsquo;s input parameters can be retrieved, as well as the database connection state.</p> +<p>The database connector is used to initialize, cleanup and get the database connection state. 
The database connector must implement the <em>Db</em> interface, which consists of three methods: <em>onInit</em>(), <em>onCleanup</em>() and <em>getConnectionState</em>(). <em>onInit</em>() is called before the benchmark is executed, and is responsible for initializing the database and registering the different <em>OperationHandlers</em>. <em>onCleanup</em>() is called after the benchmark has completed. Any resources that need to be released should be released here.</p> +<p>Finally, <em>getConnectionState</em>() returns an instance of <em>DbConnectionState</em>, which encapsulates any state that needs to be shared between <em>OperationHandler</em> instances. For instance, this state could contain the necessary classes used to execute a given query for the implementing system.</p> +<p>A good example of how to implement the benchmark can be found <a href="https://github.com/ldbc/ldbc_driver/wiki/Implementing%20a%20Database%20Connector">here</a>.</p> +<h3 id="workloads">Workloads</h3> +<p>Currently, LDBC has only released the first draft of the Interactive workload, but the business intelligence and analytical workloads are in the works. Workloads are designed to mimic the different usage scenarios found in operating a real social network site, and each of them targets one or more types of systems. Each workload defines a set of queries and query mixes, designed to stress the systems under test in different choke-point areas, while being credible and realistic.</p> +<p>Interactive workload reproduces the interaction between the users of the social network by including lookups and transactions that update small portions of the database. These queries are designed to be interactive and target systems capable of responding to such queries with low latency for multiple concurrent users. 
Examples of Interactive queries are, given a user, retrieving those friends with a specific name, or finding the most recent post and comments created by your friends.</p> +<p>Business Intelligence workload will represent those business intelligence analytics a social network company would like to perform in the social network, in order to take advantage of the data to discover new business opportunities. This workload will explore moderate portions of data from different entities, and will perform more complex and data intensive operations compared to the Interactive ones.</p> +<p>Examples of possible Business Intelligence queries could be finding trending topics in a country at a given moment, or looking for fraudulent “likers”.</p> +<p>Finally, the Analytical workload will aim at exploring the characteristics of the underlying structure of the network. Shortest paths, community detection or centrality, are representative queries of this workload, and will imply touching a vast amount of the dataset.</p> +<h3 id="final-remarks">Final remarks</h3> +<p>This is just a quick overview of the SNB benchmark. For a more detailed description, do not hesitate to read the official SNB specification <a href="https://github.com/ldbc/ldbc_snb_docs">draft</a>, and stay tuned to the LDBC blog for future blog posts detailing all of the SNB parts in depth.</p> + + + + + Introducing SNB Interactive, the LDBC Social Network Benchmark Online Workload + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/introducing-snb-interactive-the-ldbc-social-network-benchmark-online-workload/ + <p>The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. 
This post introduces the interactive workload.</p> +<p>The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user&rsquo;s social environment and potentially access data associated with the friends or a user and their friends.</p> +<p>This is representative of an operational application. This goes beyond OLTP (On Line Transaction Processing) by having substantially more complex queries touching much more data than the point lookups and short reports in TPC-C or E. The emphasis is presenting a rich and timely view of a constantly changing environment.</p> +<p>SNB Interactive gives end users and application developers a reference workload for comparing the relative merits of different technologies for graph data management. These range from dedicated graph databases to RDF stores and relational databases. There are graph serving benchmarks such as the Facebook Linkbench but SNB Interactive goes well beyond this in richness of schema and queries.</p> +<p>The challenge to implementors is handling the user facing logic of a social network in a single system as the scale increases. The present practice in large social networks is massive sharding and use of different SQL and key value stores for different aspects of the service. The SNB workload is not intended to replicate this situation but to look for ways forward, so that one system can keep up with transactions and offer users rich and varied insight into their environment. The present practice relies on massive precomputation but SNB interactive seeks more agility and ad hoc capability also on the operational side.</p> +<p>The dataset is scaled in buckets, with distinct scales for 10, 30, 100, 300GB and so forth. A 100GB dataset has approximately 500,000 simulated users with their connections and online history. 
This is a convenient low-end single server size while 500 million users is 100TB, which is a data center scale requiring significant scale-out.</p> +<p>The metric is operations per minute at scale. Online benchmarks typically have a fixed ratio between throughput and dataset size. Here we depart from this, thus one can report arbitrarily high throughputs at any scale. This makes main memory approaches feasible, which corresponds to present online practices. The benchmark makes transactions and queries on a simulated timeline of social interactions. The challenge for the system is to run this as fast as possible at the selected scale while providing fast and predictable response times. Throughput can be increased at the cost of latency but here the system must satisfy response time criteria while running at the reported throughput.</p> +<p>Different technologies can be used for implementing SNB interactive. The workload is defined in natural language with sample implementations in SPARQL and Cypher. Other possibilities include SQL and graph database API&rsquo;s.</p> +<p>SNB Interactive is an example of LDBC&rsquo;s choke point driven design methodology, where we draw on the combined knowledge and experience of several database system architects for defining realistic, yet ambitious challenges whose solution will advance the state of the art.</p> +<p>The benchmark specification and associated tools are now offered for public feedback. The LDBC partners working on SNB Interactive will provide sample implementations of the workload on their systems, including Virtuoso, Neo4J and Sparsity. 
Specifics of availability and coverage may vary.</p> +<p>Subsequent posts will address the workload in more detail.</p> + + + + + Is SNB Like Facebooks LinkBench + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/is-snb-like-facebooks-linkbench/ + <p>In this post, I will discuss in some detail the rationale and goals of the design of the <a href="https://ldbcouncil.org/benchmarks/snb">Social Network Benchmark</a> (SNB) and explain how it relates to real social network data as in Facebook, and in particular FaceBook&rsquo;s own graph benchmark called <a href="https://www.facebook.com/notes/facebook-engineering/linkbench-a-database-benchmark-for-the-social-graph/10151391496443920">LinkBench</a>. We think SNB is the most intricate graph database benchmark to date (it&rsquo;s also available in RDF!), that already has made some waves. SNB recently received praise at the most important database systems conference <a href="http://www.sigmod2014.org/">SIGMOD in Snowbird</a> after being used for this year&rsquo;s <a href="https://arxiv.org/pdf/2010.12243.pdf">ACM SIGMOD Programming Contest</a>, which was about graph analytics.</p> +<p>SNB is intended to provide the following <strong>value</strong> to different stakeholders:</p> +<ul> +<li> +<p>For end users facing graph processing tasks, SNB provides a recognizable scenario against which it is possible to <em>compare merits of different products</em> and technologies. 
By covering a wide variety of scales and price points, SNB can serve as an aid to technology selection.</p> +</li> +<li> +<p>For vendors of graph database technology, SNB provides a <em>checklist of features</em> and performance characteristics that helps in product positioning and can serve to guide new development.</p> +</li> +<li> +<p>For researchers, both industrial and academic, the SNB dataset and workload provide <em>interesting challenges</em> in multiple technical areas, such as query optimization, (distributed) graph analysis, transactional throughput, and provides a way to objectively compare the effectiveness and efficiency of new and existing technology in these areas.</p> +</li> +</ul> +<p>I should clarify that even though the data model of SNB resembles Facebook (and we&rsquo;re extending it to also look more like Twitter), the goal of SNB is not to advise Facebook or Twitter what systems to use, they don&rsquo;t need LDBC for that. Rather, we take social network data as a model for the much more broader graph data management problems that IT practitioners face. The particular characteristic of a graph data management problem is that the queries and analysis is not just about finding data by value, but about learning about the <em>connection patterns</em> between data. 
The scenario of the SNB, a social network, was chosen with the following goals in mind:</p> +<ul> +<li> +<p>the benchmark scenario should be <strong>understandable</strong> to a large audience, and this audience should also understand the relevance of managing such data.</p> +</li> +<li> +<p>the scenario in the benchmark should cover the complete range of challenges <strong>relevant</strong> for graph data management, according to the benchmark scope.</p> +</li> +<li> +<p>the query challenges in it should be <strong>realistic</strong> in the sense that, though synthetic, similar data and workloads are encountered in practice.</p> +</li> +</ul> +<p>The SNB is in fact three distinct benchmarks with a common dataset, since there are <em>three different workloads</em>. Each workload produces a single metric for performance at the given scale and a price/performance metric at the scale. The full disclosure further breaks down the composition of the metric into its constituent parts, e.g. single query execution times.</p> +<ul> +<li> +<p><strong>Interactive Workload.</strong> The Interactive SNB workload is the first one we are releasing. It is defined in plain text, yet we have example implementations in Neo4j&rsquo;s Cypher, SPARQL and SQL. The interactive workloads tests a system&rsquo;s throughput with relatively simple queries with concurrent updates. The system under test (SUT) is expected to run in a steady state, providing durable storage with smooth response times. Inserts are typically small, affecting a few nodes at a time, e.g. uploading of a post and its tags. Transactions may require serializability, e.g. verifying that something does not exist before committing the transaction. Reads do not typically require more than read committed isolation. 
One could call the Interactive Workload an OLTP workload, but while queries typically touch a small fraction of the database, this can still be up to hundreds of thousands of values (the two-step neighborhood of a person in the social graph, often). Note that in order to support the read-queries, there is a lot of liberty to create indexing structures or materialized views, however such structures need to be maintained with regard to the continuous inserts that are also part of the workload. This workload is now in draft stage, which means that the <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a> and <a href="https://github.com/ldbc/ldbc_driver">driver software stack</a> are ready and the purpose is to obtain user feedback, as well as develop good system implementations. The first implementations of this workload are now running on Openlink Virtuoso, Neo4j and Sparsity Sparksee, and we are eager to see people try these, and optimize and evolve these.</p> +</li> +<li> +<p><strong>Business Intelligence Workload.</strong> There is a first stab at this workload formulated in SPARQL, tested against Openlink Virtuoso. The BI workload consists of complex structured queries for analyzing online behavior of users for marketing purposes. The workload stresses query execution and optimization. Queries typically touch a large fraction of the data and do not require repeatable read. The queries will be concurrent with trickle load (not out yet). Unlike the interactive workload, the queries touch more data as the database grows.</p> +</li> +<li> +<p><strong>Graph Analytics Workload.</strong> This workload is not yet available. It will test the functionality and scalability of the SUT for graph analytics that typically cannot be expressed in a query language. As such it is the natural domain for graph programming frameworks like Giraph.
The workload is still under development, but will consist of algorithms like PageRank, Clustering and Breadth First Search. The analytics is done on most of the data in the graph as a single operation. The analysis itself produces large intermediate results. The analysis is not expected to be transactional or to have isolation from possible concurrent updates.</p> +</li> +</ul> +<p>All the SNB scenarios share a common scalable synthetic data set, generated by a state-of-the art <a href="https://github.com/ldbc/ldbc_socialnet_bm/tree/master/ldbc_socialnet_dbgen">data generator</a>. We strongly believe in a single dataset that makes sense for all workloads, that is, the interactive and BI workloads will traverse data that has sensible PageRank outcomes, and graph clustering structure, etc. This is in contrast to <a href="http://people.cs.uchicago.edu/~tga/pubs/sigmod-linkbench-2013.pdf">LinkBench</a>, released by the team of Facebook that manages the OLTP workload on the Facebook Graph, which closely tunes to the <strong>low-level</strong> MySQL query patterns Facebook sees, but whose graph structure does not attempt to be realistic beyond average out degree of the nodes (so, it makes no attempts to create realistic community patterns or correlations) . The authors of LinkBench may be right that the graph structure does not make a difference for simple insert/update/delete/lookup actions which LinkBench itself tests, but for the SNB queries in the Interactive and BI workloads this is not true. Note that <a href="http://borthakur.com/ftp/sigmod2013.pdf">Facebook&rsquo;s IT infrastructure</a> does not store all user data in MySQL and its modified memcached (&quot;<a href="http://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/11730-atc13-bronson.pdf">TAO</a>&quot;), some of it ends up in separate subsystems (using HDFS and HBase), which is outside of the scope of LinkBench. 
However, for queries like in the SNB Interactive and BI workloads it <strong>does</strong> matter how people are connected, and how the attribute values of connected people correlate. In fact, the SNB data generator is unique in that it generates a huge graph with <em>correlations</em>, where people who live together, have the same interests or work for the same company have greater chance to be connected, and people from Germany have mostly German names, etc. Correlations frequently occur in practice and can strongly influence the quality of query optimization and execution, therefore LDBC wants to test their effects on graph data management systems (the impact of correlation among values and structure on query optimization and execution are a &ldquo;choke point&rdquo; for graph data management system where LDBC wants to stimulate innovation).</p> + + + + + Making It Interactive + https://ldbcouncil.org/post/making-it-interactive/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/making-it-interactive/ + <p><em>Synopsis:</em> Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.</p> +<p>It is about to be showtime for LDBC. The initial installment of the LDBC Social Network Benchmark (SNB) is the full data generator, test driver, workload and reference implementation for the interactive workload. SNB will further acquire business intelligence and graph analytics workloads but this post is about the interactive workload.</p> +<p>As part of finalizing the interactive workload, we need to determine precise mixes of the component queries and updates. We note that the interactive mix so far consists of very heavy queries. 
These touch, depending on the scale, upwards of a million entities in the database.</p> +<p>Now, rendering a page view in a social network site does not touch millions of entities. The query that needs to be correct and up to date touches tens or hundreds of entities, e.g. posts or social connections for a single page impression. There are also statistical views like the count of people within so many steps or contact recommendations but these are not real time and not recalculated each time they are shown.</p> +<p>So, LDBC SNB has a twofold task:</p> +<ol> +<li>In order to be a credible interactive workload, it must in fact have characteristics of one.</li> +<li>In order to stimulate progress it must have queries that are harder than those that go in routine page views but are still not database-wide analytics.</li> +</ol> +<p>Designing a workload presents specific challenges:</p> +<ol> +<li>The workload must be realistic enough for users to identify with it.</li> +<li>The workload must pose challenges and drive innovation in a useful direction.</li> +<li>The component operations must all play a noticeable role in it. If the operation&rsquo;s relative performance does not affect the score, why is it in the workload?</li> +</ol> +<p>The interactive mix now has 14 queries that are interesting from a query optimization and execution viewpoint but touch millions of entities. This is not what drives page impressions in online sites. Many users of GDB and RDF are about online sites, so this aspect must not be ignored.</p> +<p>Very roughly, the choke points (technical challenges) of SNB interactive are as follows:</p> +<ul> +<li>Random access - Traversing between people, content makes large numbers of random lookups. These can be variously parallelized and/or vectored.</li> +<li>Query optimization must produce the right plans - The primary point is join order and join type. Index vs.
hash based joins have very different performance properties and the right choice depends on correctly guessing the number of rows and of distinct keys on either side of the join.</li> +<li>When doing updates and lookups, the execution plan is obvious but there the choke point is the scheduling of large numbers of short operations.</li> +<li>Many queries have aggregation, many have distinct, all have result ordering and a limit on result count. The diverse interactions of these operators produce optimization opportunities.</li> +</ul> +<p>Dreaming up a scenario and workload is not enough for a benchmark. There must also be a strong indication that the job is do-able and plausible in the scenario.</p> +<p>In online benchmarks different operations have different frequencies and the operations are repeated large numbers of times. There is a notion of steady state, so that the reported result represents a level of performance a system can sustain indefinitely.</p> +<p>A key part of the workload definition is the workload mix, i.e. the relative frequencies of the operations. This decides in fact what the benchmark measures.</p> +<p>The other aspect is the metric, typically some variation on operations per unit of time.</p> +<p>All these are interrelated. Here we can take clicks per second as a metric, which is easy to understand. We wish to avoid the pitfall of TPC-C which ties the metric to a data size, so that for a high metric one must have a correspondingly larger database. This rule makes memory-only implementations in practice unworkable, while in reality many online systems in fact run from memory. So, here we scale in buckets, like in TPC-H but we still have an online workload. The scenario of the benchmark has its own timeline, here called simulation time. A benchmark run produces events in the simulation time but takes place in real time. This defines an acceleration ratio.
For example we could say that a system does 1000 operations per second at 300G scale, with an acceleration of 7x, i.e. 7 hours worth of simulation time are done in one hour of real time. A metric of this form is directly understandable for sizing a system, as long as the workload mix is realistic. We note that online sites usually are provisioned so that servers do not run anywhere near their peak throughput at a busy time.</p> +<p>So how to define the actual mix? By measuring. But measuring requires a reference implementation that is generally up to date for the database science of the time and where the individual workload pieces are implemented in a reasonable manner, so no bad query plans or bad schema design. For the reference implementation, we use Virtuoso column store in SQL.</p> +<p>But SQL is not graphy! Why not SPARQL? Because SPARQL has diverse fixed overheads and this is not a RDF-only workload. We do not want SPARQL overheads to bias the metric, we just want an implementation where we know exactly what goes on and how it works, with control of physical data placement so we know there are no obvious stupidities in any of this. SPARQL will come. Anyway, as said elsewhere, we believe that SPARQL will outgrow its overheads, at which point SQL or SPARQL is a matter of esthetic preference. For now, it is SQL and all we want is transparency into the metal.</p> +<p>Having this, we peg the operation mix to the update stream generated by the data generator. At the 30G scale, there are 3.5M new posts/replies per month of simulation time. For each such, a query mix will be run, so as to establish a realistic read/write ratio. The query mix will have fractional queries, for example 0.2 friends recommendations per new post, but that is not a problem, since we run large numbers of these and at the end of the run can check that the ratios of counts are as expected. Next, we run this as fast as it will go on the test system. 
Then we adjust the ratio of short and long queries to meet the following objectives:</p> +<ul> +<li>Short queries should collectively be about 45% of the CPU load.</li> +<li>Updates will be under 5%.</li> +<li>Long queries will take up the rest. For long queries, we further tune the relative frequencies so that each represents a roughly equal slice of the time. Having a query that does not influence the metric is useless, so each gets enough showtime to have an impact but by their nature some are longer than others.</li> +</ul> +<p>The reason why short queries should have a large slice is the fact that this is so in real interactive systems. The reason why long queries are important is driving innovation. Like this we get both scheduling (short lookup/update) and optimization choke points covered. As a bonus we make the mix so that we get a high metric, so many clicks per second, since this is what the operator of an online site wants.</p> +<p>There is a further catch: Different scales have different degrees of the friends graph and this will have a different influence on different queries. To see whether this twists the metric out of shape we must experiment. For example, one must not have logarithmic and linear complexity queries in the same mix, as BSBM for example has. So this is to be kept in mind as we proceed.</p> +<p>In the next post we will look at the actual mix and execution times on the test system.</p> + + + + + SNB Data Generator - Getting Started + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/snb-data-generator-getting-started/ + <p>In previous posts (<a href="https://ldbcouncil.org/post/datagen-data-generation-for-the-social-network-benchmark">this</a> and <a href="https://ldbcouncil.org/post/getting-started-with-snb">this</a>) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB.
In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.</p> +<h3 id="getting-and-configuring-hadoop">Getting and Configuring Hadoop</h3> +<p>DATAGEN runs on top of hadoop 1.2.1 to be scalable. You can download it from here. Open a console and type the following commands to decompress hadoop into the /home/user folder:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz hadoop-1.2.1.tar.gz +</span></span></code></pre></div><p>For simplicity, in this tutorial we will run DATAGEN in standalone mode, that is, only one machine will be used, using only one thread at a time to run the mappers and reducers. This is the default configuration, and therefore nothing else needs to be done for configuring it. For other configurations, such as Pseudo-Distributed (multiple threads on a single node) or Distributed (a cluster machine), visit the <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/wiki/Configuration">LDBC DATAGEN wiki</a>.</p> +<h3 id="getting-and-configuring-datagen">Getting and configuring DATAGEN</h3> +<p>Before downloading DATAGEN, be sure to fulfill the following requirements:</p> +<ul> +<li>Linux based machine</li> +<li>java 1.6 or greater</li> +<li>python 2.7.X</li> +<li>maven 3</li> +</ul> +<p>After configuring hadoop, now is the time to get DATAGEN from the LDBC-SNB official repositories. Always download the latest release, which at this time is v0.1.2. The releases page can be found <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/releases">here</a>.
Again, decompress the downloaded file with the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user +</span></span><span style="display:flex;"><span>$ tar xvfz ldbc_snb_datagen-0.1.2.tar.gz +</span></span></code></pre></div><p>This will create a folder called “ldbc_snb_datagen-0.1.2”.</p> +<p>DATAGEN provides a <em>run.sh</em> is a script to automate the compilation and execution of DATAGEN. It needs to be configured for your environment, so open it and set the two variables at the top of the script to the corresponding paths.</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>HADOOP_HOME<span style="color:#f92672">=</span>/home/user/hadoop-1.2.1 +</span></span><span style="display:flex;"><span>LDBC_SNB_DATAGEN_HOME<span style="color:#f92672">=</span>/home/user/ldbc_snb_datagen +</span></span></code></pre></div><p>HADOOP_HOME points to the path where hadoop-1.2.1 is installed, while LDBC_SNB_DATAGEN_HOME points to where DATAGEN is installed. Change these variables to the appropriate values. Now, we can execute <em>run.sh</em> script to compile and execute DATAGEN using default parameters. 
Type the following commands:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-bash" data-lang="bash"><span style="display:flex;"><span>$ cd /home/user/ldbc_snb_datagen-0.1.2 +</span></span><span style="display:flex;"><span>$ ./run.sh +</span></span></code></pre></div><p>This will run DATAGEN, and two folders will be created at the same directory: <em>social_network</em> containing the scale factor 1 dataset with csv uncompressed files, and <em>substitution_parameters</em> containing the substituion parameters needed by the driver to execute the benchmark.</p> +<h3 id="changing-the-generated-dataset">Changing the generated dataset</h3> +<p>The characteristics of the dataset to be generated are specified in the <em>params.ini</em> file. By default, this file has the following content:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:1</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:1</span> +</span></span></code></pre></div><p>The following is the list of options and their default values supported by DATAGEN:</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>scaleFactor</td> +<td>1</td> +<td>&ldquo;The scale factor of the data to generate. Possible values are: 1, 3, 10, 30, 100, 300 and 1000&rdquo;</td> +</tr> +<tr> +<td>serializer</td> +<td>csv</td> +<td>&ldquo;The format of the output data. 
Options are: csv, csv_merge_foreign, ttl&rdquo;</td> +</tr> +<tr> +<td>compressed</td> +<td>FALSE</td> +<td>Specifies to compress the output data in gzip.</td> +</tr> +<tr> +<td>outputDir</td> +<td>./</td> +<td>Specifies the folder to output the data.</td> +</tr> +<tr> +<td>updateStreams</td> +<td>FALSE</td> +<td>&ldquo;Specifies to generate the update streams of the network. If set to false, then the update portion of the network is output as static&rdquo;</td> +</tr> +<tr> +<td>numThreads</td> +<td>1</td> +<td>Sets the number of threads to use. Only works for pseudo-distributed mode</td> +</tr> +</tbody> +</table> +<p>For instance, a possible <em>params.ini</em> file could be the following:</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">scaleFactor:30</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:ttl</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For those not interested on generating a dataset for a given predefined scale factor, but for other applications, the following parameters can be specified (they need to be specified all together):</p> +<table> +<thead> +<tr> +<th>Option</th> +<th>Default value</th> +<th>Description</th> +</tr> +</thead> +<tbody> +<tr> +<td>numPersons</td> +<td>-</td> +<td>The number of persons to generate</td> +</tr> +<tr> +<td>numYears</td> +<td>-</td> +<td>The amount of years of activity</td> 
+</tr> +<tr> +<td>startYear</td> +<td>-</td> +<td>The start year of simulation.</td> +</tr> +</tbody> +</table> +<p>The following is an example of another possible <em>params.ini</em> file</p> +<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-ini" data-lang="ini"><span style="display:flex;"><span><span style="color:#a6e22e">numPersons:100000</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numYears:3</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">startYear:2010</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">serializer:csv_merge_foreign</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">compressed:false</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">updateStreams:true</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">outputDir:/home/user/output</span> +</span></span><span style="display:flex;"><span><span style="color:#a6e22e">numThreads:4</span> +</span></span></code></pre></div><p>For more information about the schema of the generated data, the different scale factors and serializers, please visit the wiki page of DATAGEN at <a href="https://github.com/ldbc/ldbc_snb_datagen_hadoop/">GitHub</a>!</p> + + + + + The Day of Graph Analytics + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + Thu, 09 Oct 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/the-day-of-graph-analytics/ + <p><em>Note: consider this post as a continuation of the &ldquo;<a href="https://ldbcouncil.org/post/making-it-interactive">Making it interactive</a>&rdquo; post by Orri Erling.</em></p> +<p>I have now completed the <a href="https://github.com/openlink/virtuoso-opensource">Virtuoso</a> TPC-H work, including scale out. 
Optimization possibilities extend to infinity but the present level is good enough. <a href="http://www.tpc.org/tpch/">TPC-H</a> is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, as it were, the cornerstone of the true science. This is however not the totality of it. From the LDBC angle, we might liken this to the last camp before attempting a mountain peak.</p> +<p>So, we may now seriously turn to graph analytics. The project has enough left to run in order to get a good BI and graph analytics workload. In LDBC in general, as in the following, BI or business intelligence means complex analytical queries. Graph analytics means graph algorithms that are typically done in graph programming frameworks or libraries.</p> +<p>The BI part is like TPC-H, except for adding the following challenges:</p> +<ul> +<li> +<p>Joins of derived tables with group by, e.g. comparing popularity of items on consecutive time periods.</p> +</li> +<li> +<p>Transitive dimensions - A geographical or tag hierarchy can be seen as a dimension table. To get the star schema plan with the selective hash join, the count of the transitive traversal of the hierarchy (hash build side) must be correctly guessed.</p> +</li> +<li> +<p>Transitivity in fact table, i.e. average length of reply thread. There the cost model must figure that the reply link is much too high cardinality for hash build side, besides a transitive operation is not a good candidate for a build in multiple passes, hence the plan will have to be by index.</p> +</li> +<li> +<p>Graph traversal with condition on end point and navigation step. The hierarchical dimensions and reply threads are in fact trees, the social graph is not. Again the system must know some properties of connectedness (in/out degree, count of vertices) to guess a traversal fanout. 
This dictates the join type in the step (hash or index). An example is a transitive closure with steps satisfying a condition, e.g. all connected persons have a specific clearance.</p> +</li> +<li> +<p>Running one query with parameters from different buckets, implying different best plan.</p> +</li> +<li> +<p>Data correlations, e.g. high selectivity arising from two interests seldom occurring together, in places where the correct estimation makes the difference between a good and a bad plan.</p> +</li> +<li> +<p>Large intermediate results stored in tables, as in materializing complex summaries of data for use in follow up queries.</p> +</li> +<li> +<p>More unions and outer joins.</p> +</li> +</ul> +<p>The idea is to cover the base competences the world has come to expect and to build in challenges to last another 10-15 years.</p> +<p>For rules and metric, we can use the TPC-H or <a href="http://www.tpc.org/tpcds/default.asp">TPC-DS</a> ones as a template. The schema may differ from an implementation of the interactive workload, as these things would normally run on different systems anyway. As another activity that is not directly LDBC, I will do a merge of SNB and <a href="http://www.openstreetmap.org/">Open Street Map</a>. The geolocated things (persons, posts) will get real coordinates from their vicinity and diverse geo analytics will become possible. This is of some significant interest to Geoknow, another FP7 where OpenLink is participating.</p> +<p>Doing the BI mix and even optimizing the interactive part involves some redoing of the present support for transitivity in Virtuoso. The partitioned group by with some custom aggregates is the right tool for the job, with all parallelization, scale-out, etc ready. You see, TPC-H is very useful also in places one does not immediately associate with it.</p> +<p>As a matter of fact, this becomes a BSP (bulk synchronous processing) control structure. 
Run any number of steps, each item produces results/effects scattered across partitions. The output of the previous is the input of the next. We might say BSP is an attractor or &ldquo;Platonic&rdquo; control structure to which certain paths inevitably lead. Last year I did a BSP implementation in SQL, reading and writing tables and using transactions for serializable update of the border. This is possible but will not compete with a memory based framework and not enough of the optimization potential, e.g. message combining, is visible to the engine in this formulation. So, now we will get this right, as suggested.</p> +<p>So, the transitive derived table construct can have pluggable aggregations, e.g. remembering a path, a minimum length or such), reduction like a scalar-valued aggregate (min/max), different grouping sets like in a group by with cube or grouping sets, some group-by like reduction for message combining and so forth. If there is a gather phase that is not just the result of the scatter of the previous step, this can be expressed as an arbitrary database query, also cross partition in a scale-out setting.</p> +<p>The distributed/partitioned group by hash table will be a first class citizen, like a procedure scoped temporary table to facilitate returning multiple results and passing large data between multiple steps with different vertex operations, e.g. forward and backward in betweenness centrality.</p> +<p>This brings us to the graph analytics proper, which is often done in BSP style, e.g. <a href="http://es.slideshare.net/shatteredNirvana/pregel-a-system-for-largescale-graph-processing">Pregel</a>, <a href="http://giraph.apache.org">Giraph</a>, <a href="http://uzh.github.io/signal-collect/">Signal-Collect</a>, some but not all <a href="http://ppl.stanford.edu/main/green_marl.html">Green-Marl</a> applications. 
In fact, a Green-Marl back end for Virtuoso is conceivable, whether one will be made is a different matter.</p> +<p>With BSP in the database engine, a reference implementation of many standard algorithms is readily feasible and performant enough to do reasonable sizing for the workload and to have a metric. This could be edges or vertices per unit of time, across a mix of algorithms, for example. Some experimentation will be needed. The algorithms themselves may be had from the Green-Marl sample programs or other implementations. Among others, Oracle would presumably agree that this sort of functionality will in time migrate into core database. We will here have a go at this and along the way formulate some benchmark tasks for a graph analytics workload. Whenever feasible, this will derive from existing work such as <a href="http://graphbench.org/">graphbench.org</a> but will be adapted to the SNB dataset.</p> +<p>The analytics part will be done with more community outreach than the interactive one. I will blog about the business questions, queries and choke points as we go through them. 
The interested may pitch in as the matter comes up.</p> + + + + + Using LDBC SPB to Find OWLIM Performance Issues + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + Wed, 20 Aug 2014 00:00:00 +0000 + + https://ldbcouncil.org/post/using-ldbc-spb-to-find-owlim-performance-issues/ + <p>During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC <a href="https://ldbcouncil.org/developer/spb">Semantic Publishing Benchmark</a> (LDBC-SPB) as a part of our development and release process.</p> +<p>First thing we’ve started using the LDBC-SPB for is to monitor the performance of our RDF Store when a new release is about to come out.</p> +<p>Initially we’ve decided to fix some of the benchmark parameters :</p> +<ul> +<li>the dataset size - 50 million triples (LDBC-SPB50) * benchmark warmup and benchmark run times - 60s and 600s respectively. * maximum number of Editorial Agents (E) : 2 (threads that will execute INSERT/UPDATE operations) * maximum number of Aggregation Agents (A) : 16 (threads that will execute SELECT operations) * generated data by the benchmark driver to be “freshly” deployed before each benchmark run - benchmark driver can be configured to generate the data and stop. We’re using that option and have a fresh copy of it put aside ready for each run.</li> +</ul> +<p>Having those parameters fixed, running LDBC-SPB is a straight-forward task. The hardware we’re using for benchmarking is a machine with 2 Intel Xeon CPUs, 8 cores each, 256 GB of memory and SSD storage, running Linux. Another piece of hardware we’ve tested with is a regular desktop machine with Intel i7, 32 GB of memory and HDD storage. During our experiments we have allowed a deviation in results of 5% to 10% because of the multi-threaded nature of the benchmark driver.</p> +<p>We’ve also decided to produce some benchmark results on Amazon’s EC2 Instances and compare with the results we’ve had so far. 
Starting with m3.2xlarge instance (8 vCPUs, 30GB of memory and 2x80GB SSD storage) on a 50M dataset we’ve achieved more than 50% lower results than ones on our own hardware. On a larger Amazon Instance c3.4xlarge (16 vCPUs, 30GB of memory and doubled SSD storage) we’ve achieved the same performance in terms of aggregation operations and even worse performance in terms of editorial operations, which we attribute to the fact that Amazon instances are not providing consistent performance all the time.</p> +<p>Following two charts are showing how OWLIM performs on different hardware and with different configurations. They also give an indication of Amazon’s capabilities compared to the results achieved on a bare-metal hardware.</p> +<p><img src="16-2-Performance.png" alt="image"></p> +<p>Figure 1 : OWLIM Performance : 2 amazon instances and 2 local machines. 16 aggregation and 2 editorial agents running simultaneously. Aggregation and editorial operations displayed here should be considered independently, i.e. even though editorial operations graph shows higher results on Amazon m3.2xlarge instance, values are normalized and are referring to corresponding type of operation.</p> +<p><img src="8-0-Performance.png" alt="image"></p> +<p>Figure 2 : OWLIM Performance : 2 amazon instances and 2 local machines. 8 aggregation agents running simultaneously. Read-only mode.</p> +<p>Another thing that we’re using LDBC-SPB for is to monitor load performance speeds. Loading of generated data can be done either manually by creating some sort of a script (CURL), or by the benchmark driver itself which will execute a standard POST request against a provided SPARQL endpoint. Benchmark&rsquo;s data generator can be configured to produce chunks of generated data in various sizes, which can be used for experiments on load performance. Of course load times of forward-chaining reasoners cannot be compared to backward-chaining ones which is not the goal of the benchmark. 
Loading performance is not measured “officially” by LDBC-SPB (although time for loading the data is reported), but it’s a good thing to have when comparing RDF Stores.</p> +<p>An additional and interesting feature of the SPB is the test for conformance to OWL2-RL rule-set. It is a part of the LDBC-SPB benchmark and that phase is called <em>checkConformance</em>. The phase is run independently of the benchmark phase itself. It requires no data generation or loading except the initial set of ontologies. It tests RDF store’s capabilities for conformance to the rules in OWL2-RL rule-set by executing a number of INSERT/ASK queries specific for each rule. The result of that phase is a list of all rules that have been passed or failed, which is very useful for regression testing.</p> + + + + + Fourth TUC meeting + https://ldbcouncil.org/event/fourth-tuc-meeting/ + Thu, 03 Apr 2014 12:32:22 -0400 + + https://ldbcouncil.org/event/fourth-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the fourth Technical User Community (TUC) meeting.</p> +<p>This will be a one-day event at CWI in Amsterdam on <em>Thursday April 3, 2014</em>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<p><strong>For presenters please limit your talks to just 15 minutes</strong></p> +<h3 id="agenda">Agenda</h3> +<p><strong>April 3rd</strong></p> +<ul> +<li> +<p>10:00 Peter Boncz (VUA) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506371.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=JYWVgrP1kVY">video</a>: <em>LDBC project status update</em></p> +</li> +<li> +<p>10:20 Norbert Martinez (UPC) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506375.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=4yREJQ3yDr0">video</a>: <em>Status update on the LDBC Social Network Benchmark (SNB) task force</em>.</p> +</li> +<li> +<p>10:50 Alexandru Iosup (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506363.ppt">ppt</a>, <a href="https://www.youtube.com/watch?v=ulT-RFwKpOE">video</a>: <em>Towards Benchmarking Graph-Processing Platforms</em></p> +</li> +<li> +<p>11:10 Mike Bryant (Kings College) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506364.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=KiHRTu9xx0A">video</a>: <em>EHRI Project: Archival Integration with Neo4j</em></p> +</li> +</ul> +<p><strong>11:30 coffee</strong></p> +<ul> +<li> +<p>11:50 Thilo Muth (University of Magdeburg) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506369.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=5xH3UDLP6Oc">video</a>: <em>MetaProteomeAnalyzer: a graph database backed software for functional and taxonomic protein data analysis</em></p> +</li> +<li> +<p>12:10 Davy Suvee (Janssen Pharmaceutica / Johnson &amp; Johnson) – <a 
href="https://www.youtube.com/watch?v=XN3LRJUfJIU">video</a>: <em>Euretos Brain - Experiences on using a graph database to analyse data stored as a scientific knowledge graph</em></p> +</li> +<li> +<p>12:30 Yongming Luo (TU Eindhoven) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506366.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=g_my3tBB2_s">video</a>: <em>Regularities and dynamics in bisimulation reductions of big graphs</em></p> +</li> +<li> +<p>12:50 Christopher Davis (TU Delft) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506370.pdf">pdf</a>, <a href="https://www.youtube.com/channel/UC6HbzfJ4016Vez-2HKNeDag">video</a>: <em>Enipedia - Enipedia is an active exploration into the applications of wikis and the semantic web for energy and industry issues</em></p> +</li> +</ul> +<p><strong>13:10 - 14:30 lunch @ restaurant Polder</strong></p> +<ul> +<li> +<p>14:30 <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506365.pptx">SPB task force report</a></p> +</li> +<li> +<p>15:00 Bastiaan Bijl (Sysunite) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506373.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=TsCeKDHShMY">video</a>: <em>Using a semantic approach for monitoring applications in large engineering projects</em></p> +</li> +<li> +<p>15:20 Frans Knibbe (Geodan) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506372.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=uAX-m4OewPM">video</a>: <em>Benchmarks for geographical data</em></p> +</li> +<li> +<p>15:40 Armando Stellato (University of Rome, Tor Vergata &amp; UN Food and Agriculture Organization) – <a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506374.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=mfA4csAs72Y">video</a>: <em>VocBench2.0, a Collaborative Environment for SKOS/SKOS-XL Management: scalability and (inter)operability challenges</em></p> +</li> +</ul> +<p><strong>16:00 coffee</strong></p> +<ul> +<li> +<p>16:20 Ralph Hodgson (TopQuadrant) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506367.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=ZUDnVw9P_Rc">video</a>: <em>Customer experiences in implementing SKOS-based vocabulary management systems</em></p> +</li> +<li> +<p>16:40 Simon Jupp (European Bioinformatics Institute) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506368.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=CgTuOGK92W8">video</a>: <em>[Delivering RDF for the life science at the European Bioinformatics Institute: Six months in.]</em></p> +</li> +<li> +<p>17:00 Jerven Bolleman (Swiss Institute of Bioinformatics) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506381.pdf">pdf</a>, <a href="https://www.youtube.com/watch?v=QTc3yOgoEsg">video</a>: <em>Breakmarking UniProt RDF. 
SPARQL queries that make your database cry&hellip;</em></p> +</li> +<li> +<p>17:20 Rein van &rsquo;t Veer (Digital Heritage Netherlands) – <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506380.pptx">pptx</a>, <a href="https://www.youtube.com/watch?v=2vDrZoskGyQ">video</a> <em>Time and space for heritage</em></p> +</li> +<li> +<p>17:40 <strong>end of meeting</strong></p> +</li> +<li> +<p>19:00 - 21:30 Social Dinner in restaurant Boom</p> +</li> +</ul> +<p><strong>April 4th</strong></p> +<p>LDBC plenary meeting for project partners.</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5506362.ppt">Benchmarking Graph-Processing Platforms: A Vision</a> – Alexandru Iosup</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p>The meeting will be held at the Dutch national research institute for computer science and mathematics (<a href="http://www.cwi.nl">CWI</a> - Centrum voor Wiskunde en Informatica). 
It is located at <a href="http://www.amsterdamsciencepark.nl/">Amsterdam Science Park</a>:</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505821.jpg" alt=""></p> +<p>(<a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/fourth-tuc-meeting/attachments/5538064/5505820.pdf">A5 map</a>)</p> +<h6 id="travel">Travel</h6> +<p><strong>Arriving &amp; departing:</strong></p> +<p>Amsterdam has a well-functioning and nearby airport called Schiphol (AMS, <a href="http://www.schiphol.com/">www.schiphol.nl</a>) that serves all main European carriers and also very many low-fare carriers.</p> +<p><a href="http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane">http://www.iamsterdam.com/en/visiting/touristinformation/gettingaround/arrival-and-departure/arrival-by-plane</a></p> +<p><strong>Trains</strong> (~5 per hour) are the most convenient means of transport between Schiphol airport and Amsterdam city center, the Centraal Station (17 minutes, a train every 15 minutes) &ndash; which station you are also likely arriving at in case of an international train trip.</p> +<p>From the Centraal Station in Amsterdam, there is a direct train (every half an hour, runs 11 minutes) to the Science Park station, which is walking distance of CWI. If you go from the Centraal Station to one of the hotels, you should take tram 9 &ndash; it starts at Centraal Station (exception: for Hotel Casa 400, you should take the metro to Amstel station - any of the metros will do).</p> +<p><strong>Taxi</strong> is an alternative, though expensive. 
The price from Schiphol will be around 45 EUR to the CWI or another point in the city center (depending on traffic, the ride is 20-30 minutes).</p> +<p><strong>Public transportation</strong> (tram, bus, metro) tickets for a single ride and 1-day (24 hour) passes can be purchased from the driver/conductor on trams and buses (cash only) and from vending machines in the metro stations.</p> +<p><strong>Only the &ldquo;disposable&rdquo; cards are interesting for you as visitor.</strong></p> +<p>Multi-day (up to 7-days/168 hours) passes can only be purchased from the vending machines or from the ticket office opposite of Centraal Station.</p> +<p><strong>Getting Around:</strong> the fastest way to move in the city of Amsterdam generally is by bicycle. Consider renting such a device at your hotel. For getting from your hotel to the CWI, you can either take a taxi (expensive), have a long walk (35min), use public transportation (for NH Tropen/The Manor take bus 40 from Muiderpoort Station, for Hotel Casa 400 same bus 40 but from Amstel station, and for the Rembrandt Hotel it is tram 9 until Middenweg/Kruislaan and then bus 40), or indeed bike for 12 minutes.</p> +<p><strong>Cars</strong></p> +<p>In case you plan to arrive by car, please be aware that parking space in Amsterdam is scarce and hence very expensive. But, you can park your car on the &ldquo;WCW&rdquo; terrain where CWI is located. To enter the terrain by car, you have to get a ticket from the machine at the gate. To leave the terrain, again, you can get an exit ticket from the CWI reception.</p> +<p><strong>Arriving at CWI:</strong> Once you arrive at CWI, you need to meet the reception, and tell them that you are attending the LDBC TUC meeting. 
Then, you&rsquo;ll receive a visitor&rsquo;s pass that allows you to enter our building.</p> +<p><strong>Social Dinner</strong></p> +<p>The social dinner will take place at 7pm on April 3 in Restaurant Boom (<a href="http://www.boometenendrinken.nl/">boometenendrinken.nl</a>), Linnaeusstraat 63, Amsterdam.</p> + + + + + Third TUC Meeting + https://ldbcouncil.org/event/third-tuc-meeting/ + Tue, 19 Nov 2013 08:00:00 +0000 + + https://ldbcouncil.org/event/third-tuc-meeting/ + <p>The LDBC consortium is pleased to announce the third Technical User Community (TUC) meeting!</p> +<p>This will be a one-day event in London on the <strong>19 November 2013</strong> running in collaboration with the <a href="http://www.graphconnect.com/london/">GraphConnect</a> event (18/19 November). Registered TUC participants who would like a free pass to all of GraphConnect should register for GraphConnect using the following coupon code: <strong>LDBCTUC</strong>.</p> +<p>The TUC event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology</li> +<li>Industry discussions on the contents of the benchmarks</li> +</ul> +<p>We will also be launching the LDBC non-profit organization, so anyone outside the EU project will be able to join as a member.</p> +<p>We will kick off new benchmark development task forces in the coming year, and talks at this coming TUC will play an important role in deciding the use case scenarios that will drive those benchmarks.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a></li> +<li><a href="#ldbctuc-background">LDBC/TUC Background</a> +<ul> +<li><a href="#social-network-benchmark">Social Network Benchmark</a></li> +<li><a href="#semantic-publishing-benchmark">Semantic Publishing Benchmark</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>November 19th - Public TUC Meeting</strong></p> +<p>8:00 Breakfast and registration will open for Graph Connect/TUC at 8:00 am (Dexter House)</p> +<p>short LDBC presentation (Peter Boncz) during GraphConnect keynote by Emil Eifrem (09:00-09:30 Dexter House)</p> +<p>NOTE: the TUC meeting is at the Tower Hotel, nearby Dexter House.</p> +<p>10:00 TUC Meeting Opening (Peter Boncz)</p> +<p>10:10 TUC Presentations (RDF Application Descriptions)</p> +<ul> +<li>Johan Hjerling (BBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275669.pdf">BBC Linked Data and the Semantic Publishing Benchmark</a></strong></em></li> +<li>Andreas Both (Unister): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505027.pdf">Ontology-driven applications in an e-commerce context</a></strong></em></li> +<li>Nuno Carvalho (Fujitsu Laboratories Europe): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275666.pdf"><em><strong>Fujitsu RDF use cases and benchmarking requirements</strong></em></a></li> +<li>Robina Clayphan (Europeana): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816977.ppt">Europeana and Open Data</a></strong></em></li> +</ul> +<p>11:30 Semantic Publishing Benchmark (SPB)</p> +<ul> +<li>Venelin Kotsev (Ontotext - LDBC): <em><strong><a 
href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">Semantic Publishing Benchmark Task Force Update</a></strong></em> and <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">report</a></strong></em></li> +</ul> +<p>12:00-13:00 Lunch at the Graph Connect venue</p> +<p><em>Talks During Lunch:</em></p> +<ul> +<li>Pedro Furtado, Jorge Bernardino (Univ. Coimbra): <strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275671.pdf">KEYSTONE Cost Action</a></strong></li> +</ul> +<p>13:00 TUC Presentations (Graph Application Descriptions)</p> +<ul> +<li>Minqi Zhou / Weining Qian (East China Normal University): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275670.pdf">Elastic and realistic social media data generation</a></strong></em></li> +<li>Andrew Sherlock (Shapespace): <em><strong>Shapespace Use Case</strong></em></li> +<li>Sebastian Verheughe (Telenor): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275667.pdf">Real-time Resource Authorization</a></strong></em></li> +</ul> +<p>14:00 Social Network Benchmark (SNB)</p> +<ul> +<li>Norbert Martinez (UPC - LDBC): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505025.pdf">Social Network Benchmark Task Force Update</a></strong></em> and <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816975.pdf">Report</a></li> +</ul> +<p><em>14:30 Break</em></p> +<p>14:45 TUC Presentations (Graph Analytics)</p> +<ul> +<li>Keith Houck (IBM): <em><strong>Benchmarking experiences with [System G Native Store (tentative title)]</strong></em></li> +<li>Abraham Bernstein (University of 
Zurich): <em><strong>Streams and Advanced Processing: Benchmarking RDF querying beyond the Standard SPARQL Triple Store</strong></em></li> +<li>Luis Ceze (University of Washington): <em><strong>Grappa and GraphBench Status Update</strong></em></li> +</ul> +<p><em>15:45 Break</em></p> +<p>16:00 TUC Presentations (Possible Future RDF Benchmarking Topics)</p> +<ul> +<li>Christian-Emil Ore (Unit for Digital Documentation, University of Oslo, Norway): <em><strong>CIDOC-CRM</strong></em></li> +<li>Atanas Kiryakov (Ontotext): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275672.pdf">Large-scale Reasoning with a Complex Cultural Heritage Ontology (CIDOC CRM)</a></strong></em></li> +<li>Kostis Kyzirakos (National and Kapodistrian University of Athens / CWI): <em><strong><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5275668.pdf">Geographica: A Benchmark for Geospatial RDF Stores</a></strong></em></li> +<li>Xavier Lopez (Oracle): <em><strong>W3C Property Graph progress</strong></em></li> +<li>Thomas Scharrenbach (University of Zurich): <em><strong>PCKS: Benchmarking Semantic Flow Processing Systems</strong></em></li> +</ul> +<p>17:20 Meeting Conclusion (Josep Larriba Pey)</p> +<p>17:30 End of TUC meeting</p> +<p>19:00 Social dinner</p> +<p><strong>November 20th - Internal LDBC Meeting</strong></p> +<p>10:00 Start</p> +<p>12:30 <em>End of meeting</em></p> +<ul> +<li>coffee and lunch provided</li> +</ul> +<h3 id="logistics">Logistics</h3> +<p><strong>Date</strong></p> +<p>19th November 2013</p> +<p><strong>Location</strong></p> +<p>The TUC meeting will be held in <strong>The Tower</strong> hotel (<a href="http://goo.gl/qZt8Fz">Google Maps link</a>) approximately 4 minutes&rsquo; walk from the <a href="http://www.graphconnect.com/london/">GraphConnect</a> conference in London.</p> +<p>Getting there</p> +<ul> +<li>From City Airport is the easiest: short ride 
on the DLR to Tower Gateway. Easy.</li> +<li>From London Heathrow: first need to take the Heathrow Express to Paddington. Then take the Circle line to Tower Hill. <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554995.pdf">See attached</a>.</li> +</ul> +<h3 id="ldbctuc-background">LDBC/TUC Background</h3> +<p>Looking back, we have been working on two benchmarks for the past year: a Social Network Benchmark (SNB) and a Semantic Publishing Benchmark (SPB). While below we provide a short summary, all the details of the work on these benchmark development efforts can be found in the first yearly progress reports:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SNB_Report_Nov2013.pdf</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4816974.pdf">LDBC_SPB_Report_Nov2013.pdf</a></li> +</ul> +<p>A summary of these efforts can be read below or, for a more detailed account, please refer to: <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/4554967.pdf">The Linked Data Benchmark Council: a Graph and RDF industry benchmarking effort</a>. Annual reports about the progress, results, and future work of these two efforts will soon be available for download here, and will be discussed in depth at the TUC.</p> +<h4 id="social-network-benchmark">Social Network Benchmark</h4> +<p>The Social Network Benchmark (SNB) is designed for evaluating a broad range of technologies for tackling graph data management workloads. The systems targeted are quite broad: from graph, RDF, and relational database systems to Pregel-like graph compute frameworks. 
The social network scenario was chosen with the following goals in mind:</p> +<ul> +<li>it should be understandable, and the relevance of managing such data should be understandable</li> +<li>it should cover the complete range of interesting challenges, according to the benchmark scope</li> +<li>the queries should be realistic, i.e., similar data and workloads are encountered in practice</li> +</ul> +<p>SNB includes a data generator for creation of synthetic social network data with the following characteristics:</p> +<ul> +<li>data schema is representative of real social networks</li> +<li>data generated includes properties occurring in real data, e.g. irregular structure, structure/value correlations, power-law distributions</li> +<li>the software generator is easy-to-use, configurable and scalable</li> +</ul> +<p>SNB is intended to cover a broad range of aspects of social network data management, and therefore includes three distinct workloads:</p> +<ul> +<li><strong>Interactive</strong> +<ul> +<li>Tests system throughput with relatively simple queries and concurrent updates; it is designed to test ACID features and scalability in an online operational setting.</li> +<li>The targeted systems are expected to be those that offer transactional functionality.</li> +</ul> +</li> +<li><strong>Business Intelligence</strong> +<ul> +<li>Consists of complex structured queries for analyzing online behavior of users for marketing purposes; it is designed to stress query execution and optimization.</li> +<li>The targeted systems are expected to be those that offer an abstract query language.</li> +</ul> +</li> +<li><strong>Graph Analytics</strong> +<ul> +<li>Tests the functionality and scalability of systems for graph analytics, which typically cannot be expressed in a query language.</li> +<li>Analytics is performed on most/all of the data in the graph as a single operation and produces large intermediate results, and it is not expected to be transactional or need 
isolation.</li> +<li>The targeted systems are graph compute frameworks though database systems may compete, for example by using iterative implementations that repeatedly execute queries and keep intermediate results in temporary data structures.</li> +</ul> +</li> +</ul> +<h4 id="semantic-publishing-benchmark">Semantic Publishing Benchmark</h4> +<p>The Semantic Publishing Benchmark (SPB) simulates the management and consumption of RDF metadata that describes media assets, or creative works.</p> +<p>The scenario is a media organization that maintains RDF descriptions of its catalogue of creative works &ndash; input was provided by actual media organizations which make heavy use of RDF, including the BBC. The benchmark is designed to reflect a scenario where a large number of aggregation agents provide the heavy query workload, while at the same time a steady stream of creative work description management operations are in progress. This benchmark only targets RDF databases, which support at least basic forms of semantic inference. A tagging ontology is used to connect individual creative work descriptions to instances from reference datasets, e.g. sports, geographical, or political information. The data used will fall under the following categories: reference data, which is a combination of several Linked Open Data datasets, e.g. GeoNames and DBpedia; domain ontologies, that are specialist ontologies used to describe certain areas of expertise of the publishing, e.g., sport and education; publication asset ontologies, that describe the structure and form of the assets that are published, e.g., news stories, photos, video, audio, etc.; and tagging ontologies and the metadata, that links assets with reference/domain ontologies.</p> +<p>The data generator is initialized by using several ontologies and datasets. The instance data collected from these datasets are then used at several points during the execution of the benchmark. 
Data generation is performed by generating SPARQL fragments for create operations on creative works and executing them against the RDF database system.</p> +<p>Two separate workloads are modeled in SPB:</p> +<ul> +<li><strong>Editorial:</strong> Simulates creating, updating and deleting creative work metadata descriptions. Media companies use both manual and semi-automated processes for efficiently and correctly managing asset descriptions, as well as annotating them with relevant instances from reference ontologies.</li> +<li><strong>Aggregation:</strong> Simulates the dynamic aggregation of content for consumption by the distribution pipelines (e.g. a web-site). The publishing activity is described as &ldquo;dynamic&rdquo;, because the content is not manually selected and arranged on, say, a web page. Instead, templates for pages are defined and the content is selected when a consumer accesses the page.</li> +</ul> +<p><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/third-tuc-meeting/attachments/4325436/5505026.pdf">Status of the Semantic Publishing Benchmark</a></p> + + + + + Second TUC Meeting + https://ldbcouncil.org/event/second-tuc-meeting/ + Mon, 22 Apr 2013 10:00:00 +0000 + + https://ldbcouncil.org/event/second-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the second Technical User Community (TUC) meeting.</p> +<p>This will be a two day event in Munich on the <strong>22/23rd April 2013</strong>.</p> +<p>The event will include:</p> +<ul> +<li>Introduction to the objectives and progress of the LDBC project.</li> +<li>Description of the progress of the benchmarks being evolved through Task Forces.</li> +<li>Users explaining their use-cases and describing the limitations they have found in current technology.</li> +<li>Industry discussions on the contents of the benchmarks.</li> +</ul> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +<li><a href="#venue">Venue</a> +<ul> +<li><a href="#getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</a></li> +<li><a href="#getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</a></li> +<li><a href="#getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</a></li> +</ul> +</li> +<li><a href="#getting-there">Getting there</a></li> +<li><a href="#social-dinner">Social Dinner</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p><strong>April 22nd</strong></p> +<p>10:00 <em>Registration.</em><br> +10:30 Josep Lluis Larriba Pey (UPC) - <em>Welcome and Introduction.</em><br> +10:30 Peter Boncz (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687373.pptx">LDBC: goals and status</a></p> +<p><em>Social Network Use Cases (with discussion moderated by Josep Lluis Larriba Pey)</em></p> +<p>11:00 Josep Lluis Larriba Pey (UPC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687372.pdf">Social Network Benchmark Task Force</a><br> +11:30 Gustavo González (Mediapro): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687367.pdf">Graph-based User Modeling through Real-time Social Streams</a><br> +12:00 Klaus Großmann (Dshini): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687365.pdf">Neo4j at Dshini</a></p> +<p>12:30 Lunch</p> +<p><em>Semantic Publishing Use Cases (with discussion moderated by Barry Bishop)</em></p> 
+<p>13:30 Barry Bishop (Ontotext): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687366.pptx">Semantic Publishing Benchmark Task Force</a><br> +14:00 Dave Rogers (BBC): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687364.pptx">Linked Data Platform at the BBC</a><br> +14:30 Edward Thomas (Wolters Kluwer): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687374.pdf">Semantic Publishing at Wolters Kluwer</a></p> +<p>15:00 Coffee break</p> +<p><em>Projects Related to LDBC</em></p> +<p>15:30 Fabian Suchanek (MPI): &ldquo;YAGO: A large knowledge base from Wikipedia and WordNet&rdquo;<br> +16:00 Antonis Loziou (VUA): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687375.pptx">The OpenPHACTS approach to data integration</a><br> +16:30 Mirko Kämpf (Brox): &ldquo;GeoKnow - Spatial Data Web project and Supply Chain Use Case&rdquo;</p> +<p>17:00 <em>End of first day</em></p> +<p>19:00 Social dinner</p> +<p><strong>April 23rd</strong></p> +<p><em>Industry &amp; Hardware Aspects</em></p> +<p>10:00 Xavier Lopez (Oracle): <a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687384.pdf">Graph Database Performance an Oracle Perspective.pdf</a><br> +10:30 Pedro Trancoso (University of Cyprus): &ldquo;Benchmarking and computer architecture: the research side&rdquo;</p> +<p>11:00 Coffee break</p> +<p><em>Future Steps and TUC feedback session</em></p> +<p>11:30 Peter Boncz (VUA) moderates: next steps in the Social Networking Task Force<br> +12:00 Barry Bishop (Ontotext) moderates: next steps in the Semantic Publishing Task Force&quot;</p> +<p>12:30 <em>End of meeting</em></p> +<h3 id="logistics">Logistics</h3> +<h4 id="date">Date</h4> +<p>22nd and 23th April 2013</p> +<h4 
id="location">Location</h4> +<p>The TUC meeting will be held at LE009 room at LRZ (Leibniz-Rechenzentrum) located inside the TU Munich campus in Garching, Germany. The address is:</p> +<p>LRZ (Leibniz-Rechenzentrum)<br> +Boltzmannstraße 1<br> +85748 Garching, Germany</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi and Subway <a href="http://www.in.tum.de/fileadmin/user_upload/Sonstiges/anfahrt_garching.pdf">Ubahn</a></p> +<h5 id="getting-to-the-tum-campus-from-the-munich-city-center-subway-u-bahn">Getting to the TUM Campus from the Munich city center: Subway (U-Bahn)</h5> +<p>Take the U-bahn line U6 in the direction of Garching-Forschungszentrum, exit at the end station. Take the south exit to MI-Building and LRZ on the Garching Campus. The time of the journey from the city center is approx. 25-30 minutes. In order to get here from the City Center, you need the Munich XXL ticket that costs around 7.50 euros and covers all types of transportation for one day. The ticket has to be validated before ride.</p> +<h5 id="getting-to-the-tum-campus-from-the-munich-airport">Getting to the TUM Campus from the Munich Airport</h5> +<ol> +<li> +<p>(except weekends) S-Bahn S8 line in the direction of (Hauptbahnhof) Munich Central Station until the third stop, Ismaning (approx. 13 minutes). From here Bus Nr. 230 until stop MI-Building on the Garching Campus. Alternatively: S1 line until Neufahrn, then with the Bus 690, which stops at Boltzmannstraße.</p> +</li> +<li> +<p>S-Bahn lines S8 or S1 towards City Center until Marienplatz stop. Then change to U-bahn U6 line towards Garching-Forschungszentrum, exit at the last station. Take the south exit to MI-Building and LRZ.</p> +</li> +<li> +<p>Taxi: fare is ca. 30-40 euros.</p> +</li> +</ol> +<p>For cases 1 and 2, before the trip get the One-day Munich Airport ticket and validate it. 
It will cover all public transportation for that day.</p> +<h5 id="getting-to-the-tum-campus-from-garching-u-bahn">Getting to the TUM Campus from Garching: U-Bahn</h5> +<p>The city of Garching is located on the U6 line, one stop before the Garching-Forschungszentrum. In order to get from Garching to Garching-Forschungszentrum with the U-bahn, a special one-way ticket called Kurzstrecke (1.30 euros) can be purchased.</p> +<p><strong>Finding LRZ@TUM</strong></p> +<p><a href="http://www.openstreetmap.org/?mlat=48.2615702464&amp;mlon=11.6686558264&amp;zoom=32">OpenStreetMap link</a></p> +<p><a href="https://maps.google.com/maps?q=48.2615702464,11.6686558264&amp;spn=0.005,0.005&amp;t=k">Google Maps link</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687268.gif" alt=""></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/second-tuc-meeting/attachments/2523698/2687269.gif" alt=""></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying: Munich</strong> airport is located 28.5 km northeast of Munich. There are two ways to get from the airport to the city center: suburban train (S-bahn) and Taxi.</p> +<p><strong>S-Bahn:</strong> S-bahn lines S1 and S8 will get you from the Munich airport to the city center, stopping at both Munich Central Station (Hauptbahnhof) and Marienplatz. One-day Airport-City ticket costs 11.20 euros and is valid for the entire Munich area public transportation during the day of purchase (the tickets needs to be validated before the journey). S-bahn leaves every 5-20 minutes and reaches the city center in approx. 
40 minutes.</p> +<p><strong>Taxi:</strong> taxi from the airport to the city center costs approximately 50 euros</p> +<h4 id="social-dinner">Social Dinner</h4> +<p>The social dinner will take place at 7 pm on April 22 in Hofbräuhaus (second floor)</p> +<p>Address: Hofbräuhaus, Platzl 9, Munich</p> + + + + + First TUC Meeting + https://ldbcouncil.org/event/first-tuc-meeting/ + Mon, 19 Nov 2012 09:00:00 +0100 + + https://ldbcouncil.org/event/first-tuc-meeting/ + <p>The LDBC consortium are pleased to announce the first Technical User Community (TUC) meeting. This will be a two day event in Barcelona on the <strong>19/20th November 2012</strong>.</p> +<p>So far more than six commercial consumers of graph/RDF database technology have expressed an interest in attending the event and more are welcome. The proposed format of the event wil include:</p> +<ul> +<li>Introduction by the coordinator and technical director explaining the objectives of the LDBC project</li> +<li>Invitation to users to explain their use-cases and describe the limitations they have found in current technology</li> +<li>Brain-storming session for identifying trends and mapping out strategies to tackle existing choke-points</li> +</ul> +<p>The exact agenda will be published here as things get finalised before the event.</p> +<p>All users of RDF and graph databases are welcome to attend. 
If you are interested, please contact: ldbc AT ac DOT upc DOT edu</p> +<ul> +<li><a href="#agenda">Agenda</a></li> +<li><a href="#slide">Slide</a> +<ul> +<li><a href="#logistics">Logistics</a> +<ul> +<li><a href="#date">Date</a></li> +<li><a href="#location">Location</a></li> +</ul> +</li> +<li><a href="#venue">Venue</a></li> +<li><a href="#getting-there">Getting there</a></li> +</ul> +</li> +</ul> +<h3 id="agenda">Agenda</h3> +<p>We will start at 9:00 on Monday for a full day, followed by a half a day on Tuesday to allow attendees to travel home on the evening of the 20th.</p> +<p><strong>Day 1</strong></p> +<p>09:00 Welcome (Location: Aula Master)<br> +09:30 Project overview (Emphasis on task forces?) + Questionnaire results?<br> +10:30 Coffee break<br> +11:00 User talks (To gather information for use cases?)</p> +<p>13:00 Lunch</p> +<p>14:00 User talks (cont.)<br> +15:00 Use case discussions (based on questionnaire results + consortium proposal + user talks).<br> +16:00 Task force proposals (consortium)<br> +17:00 Finish first day</p> +<p>20:00 Social dinner</p> +<p><strong>Day 2</strong></p> +<p>10:00 Task force discussion (consortium + TUC)<br> +11:00 Coffe break<br> +11:30 Task force discussion (consortium + TUC)<br> +12:30 Summaries (Task forces, use cases, &hellip;) and actions</p> +<p>13:00 Lunch and farewell</p> +<p>15:00 LDBC Internal meeting</p> +<h3 id="slide">Slide</h3> +<p>Opening session:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686995.pptx">CWI – Peter Boncz</a> – Objectives</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687001.pdf">UPC – Larri</a> – Questionnaire</li> +</ul> +<p>User stories:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686998.pdf">BBC – Jem Rayfield</a></li> +<li>CA Technologies – Victor Muntés</li> 
+<li>Connected Discovery (Open Phacts) – Bryn Williams-Jones</li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687003.pptx">Elsevier – Alan Yagoda</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687000.pptx">ERA7 Bioinformatics – Eduardo Pareja</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687005.pptx">Press Association – Jarred McGinnis</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687004.pptx">RJLee – David Neuer</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686994.pdf">Yale – Lec Maj</a></li> +</ul> +<p>Benchmark proposals:</p> +<ul> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2686991.pdf">Publishing benchmark proposal – Ontotext – Barry Bishop</a></li> +<li><a href="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/2687002.pdf">Social Network Benchmark Proposal – UPC – Larri</a></li> +</ul> +<h4 id="logistics">Logistics</h4> +<h5 id="date">Date</h5> +<p>19th and 20th November 2012</p> +<h5 id="location">Location</h5> +<p>The TUC meeting will be held at “Aula Master” at A3 building located inside the “Campus Nord de la UPC” in Barcelona. The address is:</p> +<p>Aula Master<br> +Edifici A3, Campus Nord UPC<br> +C. 
Jordi Girona, 1-3<br> +08034 Barcelona, Spain</p> +<h4 id="venue">Venue</h4> +<p>To reach the campus, there are several options, including Taxi, <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=c8996f6c-8ad5-4d21-b59b-faf9fceebd80&amp;groupId=10168">Metro</a> and <a href="http://www.tmb.cat/ca/c/document_library/get_file?uuid=5e6af5e2-7677-4ce8-85bb-8e63f2b086f1&amp;groupId=10168">Bus</a>.</p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933315.jpg" alt=""></p> +<p><strong>Finding UPC</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933318.jpg" alt=""></p> +<p><strong>Finding the meeting room</strong></p> +<h4 id="getting-there">Getting there</h4> +<p><strong>Flying:</strong> Barcelona airport is situated 12 km from the city. There are several ways of getting from the airport to the centre of Barcelona, the cheapest of which is to take the train located outside just a few minutes walking distance past the parking lots at terminal 2 (there is a free bus between terminal 1 and terminal 2, see this <a href="http://goo.gl/maps/iJqlj">map of the airport</a>). It is possible to buy 10 packs of train tickets which makes it cheaper. Taking the bus to the centre of town is more convenient as they leave directly from terminal 1 and 2, however it is more expensive than the train.</p> +<p><strong>Rail:</strong> The Renfe commuter train leaves the airport every 30 minutes from 6.13 a.m. to 11.40 p.m. Tickets cost around 3€ and the journey to the centre of Barcelona (Sants or Plaça Catalunya stations) takes 20 minutes.</p> +<p><strong>Bus:</strong> The Aerobus leaves the airport every 12 minutes, from 6.00 a.m. to 24.00, Monday to Friday, and from 6.30 a.m. to 24.00 on Saturdays, Sundays and public holidays. 
Tickets cost 6€ and the journey ends in Plaça Catalunya in the centre of Barcelona.</p> +<p><strong>Taxi:</strong> From the airport, you can take one of Barcelona&rsquo;s typical black and yellow taxis. Taxis may not take more than four passengers. Unoccupied taxis display a green light and have a clearly visible sign showing LIBRE or LLIURE. The trip to Sants train station costs approximately €16 and trips to other destinations in the city cost approximately €18.</p> +<p><strong>Train and bus:</strong> Barcelona has two international train stations: Sants and França. Bus companies have different points of arrival in different parts of the city. You can find detailed information in the following link: <a href="http://www.barcelona-airport.com/eng/transport_eng.htm">http://www.barcelona-airport.com/eng/transport_eng.htm</a></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933316.jpg" alt=""></p> +<p><strong>The locations of the airport and the city centre</strong></p> +<p><img src="https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/event/first-tuc-meeting/attachments/1671180/1933317.jpg" alt=""></p> +<p><strong>Bus map</strong></p> + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/1/index.html b/jan-2-2006/page/1/index.html new file mode 100644 index 00000000..718d713b --- /dev/null +++ b/jan-2-2006/page/1/index.html @@ -0,0 +1,10 @@ + + + + https://ldbcouncil.org/jan-2-2006/ + + + + + + diff --git a/jan-2-2006/page/2/index.html b/jan-2-2006/page/2/index.html new file mode 100644 index 00000000..d49f0601 --- /dev/null +++ b/jan-2-2006/page/2/index.html @@ -0,0 +1,804 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + + + + +
+
+
+ +

LDBC and Apache Flink

+
Tags:
+ FLINK + , DATAGEN + , SNB + +
+
+ +

Apache Flink [1] is an open source platform for distributed stream and batch data processing. Flink’s core is a streaming dataflow engine that provides data distribution, communication, and fault tolerance for distributed computations over data streams. Flink also builds batch processing on top of the streaming engine, overlaying native iteration support, managed memory, and program optimization.

+

+

Flink offers multiple APIs to process data …

+ +
+
+ +
+ + +
+
+
+ +

Elements of Instance Matching Benchmarks: a Short Overview

+
Tags:
+ INSTANCE MATCHING + , SPB + +
+
+ +

The number of datasets published in the Web of Data as part of the Linked Data Cloud is constantly increasing. The Linked Data paradigm is based on the unconstrained publication of information by different publishers, and the interlinking of web resources through “same-as” links which specify that two URIs correspond to the same real world object. In the vast number of data sources participating in the Linked Data Cloud, this information is not …

+ +
+
+ +
+ + +
+
+ +
+ +

In this post we will look at running the LDBC SNB on Virtuoso.

+

First, let’s recap what the benchmark is about:

+
    +
  1. +

    fairly frequent short updates, with no update contention worth mentioning

    +
  2. +
  3. +

    short random lookups

    +
  4. +
  5. +

    medium complex queries centered around a person’s social environment

    +
  6. +
+

The updates exist so as to invalidate strategies that rely too heavily on precomputation. The short lookups exist for the sake of realism; after all, an …

+ +
+
+ +
+ + +
+
+
+ +

SNB and Graphs Related Presentations at GRADES '15

+
Tags:
+ SIGMOD + , GRAPHALYTICS + , GRADES + , SNB + , DATAGEN + , WORKSHOP + +
+
+ +

On May 31st, the GRADES workshop will take place in Melbourne as part of the ACM SIGMOD/PODS conference. GRADES started as an initiative of the Linked Data Benchmark Council at SIGMOD/PODS 2013, held in New York.

+

Among the papers published in this edition we have “Graphalytics: A Big Data Benchmark for Graph-Processing Platforms”, which presents a new benchmark that uses the Social Network Benchmark data generator of LDBC (that can …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 2: Modeling Choices

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

​SNB Interactive is the wild frontier, with very few rules. This is necessary, among other reasons, because there is no standard property graph data model, and because the contestants support a broad mix of programming models, ranging from in-process APIs to declarative query.

+

In the case of Virtuoso, we have played with SQL and SPARQL implementations. For a fixed schema and well known workload, SQL will always win. The reason for this is that …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/3/index.html b/jan-2-2006/page/3/index.html new file mode 100644 index 00000000..48709850 --- /dev/null +++ b/jan-2-2006/page/3/index.html @@ -0,0 +1,791 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

LDBC Participates in the 36th Edition of the ACM SIGMOD/PODS Conference

+
Tags:
+ SIGMOD + , GRADES + , SNB + , GRAPHALYTICS + , WORKSHOP + +
+
+ +

LDBC is presenting two papers at the next edition of the ACM SIGMOD/PODS conference held in Melbourne from May 31st to June 4th, 2015. The annual ACM SIGMOD/PODS conference is a leading international forum for database researchers, practitioners, developers, and users to explore cutting-edge ideas and results, and to exchange techniques, tools and experiences.

+

On the industry track, LDBC will be presenting the Social Network Benchmark Interactive …

+ +
+
+ +
+ + +
+
+
+ +

SNB Interactive Part 1: What Is SNB Interactive Really About?

+
Tags:
+ SNB + , VIRTUOSO + , INTERACTIVE + +
+
+ +

This post is the first in a series of blogs analyzing the LDBC Social Network Benchmark Interactive workload. This is written from the dual perspective of participating in the benchmark design and of building the OpenLink Virtuoso implementation of same.

+

With two implementations of SNB interactive at four different scales, we can take a first look at what the benchmark is really about. The hallmark of a benchmark implementation is that its …

+ +
+
+ +
+ + +
+
+
+ +

Why Do We Need an LDBC SNB-Specific Workload Driver?

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

In a previous 3-part blog series we touched upon the difficulties of executing the LDBC SNB Interactive (SNB) workload, while achieving good performance and scalability. What we didn’t discuss is why these difficulties were unique to SNB, and what aspects of the way we perform workload execution are scientific contributions - novel solutions to previously unsolved problems. This post will highlight the differences between SNB and more …

+ +
+
+ +
+ + +
+
+
+ +

Event Driven Post Generation in Datagen

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As discussed in previous posts, one of the features that makes Datagen more realistic is the fact that the activity volume of the simulated Persons is not uniform, but forms spikes. In this blog entry I want to explain more in depth how this is actually implemented inside of the generator.

+

First of all, I start with a few basics of how Datagen works internally. In Datagen, once the person graph has been created (persons and their relationships), …

+ +
+
+ +
+ + +
+
+
+ +

The LDBC Datagen Community Structure

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

This blog entry is about one of the features of DATAGEN that makes it different from other synthetic graph generators that can be found in the literature: the community structure of the graph.

+

When generating synthetic graphs, one must not only pay attention to quantitative measures such as the number of nodes and edges, but also to other more qualitative characteristics such as the degree distribution and the clustering coefficient. Real graphs, and …

+ +
+
+ +
+ + +
+
+
+ +

Industry Relevance of the Semantic Publishing Benchmark

+
Tags:
+ INDUSTRY + , SPB + +
+
+ + + post/industry-relevance-of-the-semantic-publishing-benchmark/01_sf_newspapers.png +
+ +
+ +

Publishing and media businesses are going through transformation

+

I took this picture in June 2010 next to Union Square in San Francisco. I was smoking and …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/4/index.html b/jan-2-2006/page/4/index.html new file mode 100644 index 00000000..ac8be62f --- /dev/null +++ b/jan-2-2006/page/4/index.html @@ -0,0 +1,757 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

OWL-Empowered SPARQL Query Optimization

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data paradigm has become the prominent enabler for sharing huge volumes of data using Semantic Web technologies, and has created novel challenges for non-relational data management systems, such as RDF and graph engines. Efficient data access through queries is perhaps the most important data management task, and is enabled through query optimization techniques, which amount to the discovery of optimal or close to optimal execution …

+ +
+
+ +
+ + +
+
+
+ +

Person Activity Subgraph Features in LDBC DATAGEN

+
Tags:
+ SNB + , DATAGEN + +
+
+ +

When talking about DATAGEN and other graph generators with social network characteristics, our attention is typically drawn to the friendship subgraph and/or its structure. However, a social graph is more than a bunch of people connected by friendship relations; it contains many other things that are worth looking at. With a quick look at commercial social networks like Facebook, Twitter or Google+, one can easily identify a lot of other …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 2: Tracking Dependencies Between Queries

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

The SNB Driver part 1 post introduced, broadly, the challenges faced when developing a workload driver for the LDBC SNB benchmark. In this blog we’ll drill down deeper into the details of what it means to execute “dependent queries” during benchmark execution, and how this is handled in the driver. First of all, as many driver-specific terms will be used, below is a listing of their definitions. There is no need to read them in …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 3: Workload Execution Putting It All Together

+
Tags:
+ SNB + , DRIVER + , INTERACTIVE + +
+
+ +

Up until now we have introduced the challenges faced when executing the LDBC SNB benchmark, as well as explained how some of these are overcome. With the foundations laid, we can now explain precisely how operations are executed.

+

Based on the dependencies certain operations have, and on the granularity of parallelism we wish to achieve while executing them, we assign a Dependency Mode and an Execution Mode to every operation type. Using these …

+ +
+
+ +
+ + +
+
+
+ +

Running the Semantic Publishing Benchmark on Sesame, a Step by Step Guide

+
Tags:
+ SPB + , SESAME + , RDF + , TUTORIAL + , GUIDE + +
+
+ +

Until now we have discussed several aspects of the Semantic Publishing Benchmark (SPB), such as the difference in performance between virtual and real server configurations, how to choose an appropriate query mix for a benchmark run, and our experience with using SPB in the development process of GraphDB for finding performance issues.

+

In this post we provide a step-by-step guide on how to run SPB using the Sesame RDF data store on a fresh install …

+ +
+
+ +
+ + +
+
+
+ +

Semantic Publishing Instance Matching Benchmark

+
Tags:
+ INSTANCE MATCHING + , BENCHMARK + +
+
+ +

The Semantic Publishing Instance Matching Benchmark (SPIMBench) is a novel benchmark for the assessment of instance matching techniques for RDF data with an associated schema. SPIMBench extends the state-of-the-art instance matching benchmarks for RDF data in three main aspects: it allows for systematic scalability testing, supports a wider range of test cases including semantics-aware ones, and provides an enriched gold standard.

+

The SPIMBench …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/5/index.html b/jan-2-2006/page/5/index.html new file mode 100644 index 00000000..d97df910 --- /dev/null +++ b/jan-2-2006/page/5/index.html @@ -0,0 +1,757 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Further Developments in SNB BI Workload

+
Tags:
+ SNB + , BI + +
+
+ +

We are presently working on the SNB BI workload. Andrey Gubichev of TU München and I are going through the queries and are playing with two SQL-based implementations, one on Virtuoso and the other on Hyper.

+

As discussed before, the BI workload has the same choke points as TPC-H as a base but pushes further in terms of graphiness and query complexity.

+

There are obvious marketing applications for a SNB-like dataset. There are also security …

+ +
+
+ +
+ + +
+
+
+ +

Sizing AWS Instances for the Semantic Publishing Benchmark

+
Tags:
+ SPB + , AMAZON + , EC2 + , AWS + , RDF + +
+
+ +

LDBC’s Semantic Publishing Benchmark (SPB) measures the performance of an RDF database under a load typical of metadata-based content publishing, such as the famous BBC Dynamic Semantic Publishing scenario. Such a load combines tens of updates per second (e.g. adding metadata about new articles) with an even higher volume of read requests (SPARQL queries collecting recent content and data to generate a web page on a specific subject, e.g. Frank …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: a Realistic Social Network Data Generator

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

In previous posts (Getting started with snb, DATAGEN: data generation for the Social Network Benchmark), Arnau Prat discussed the main features and characteristics of DATAGEN: realism, scalability, determinism, usability. DATAGEN is the social network data generator used by the three LDBC-SNB workloads, which produces data simulating the activity in a social network site during a period of time. In this post, we conduct a series of experiments …

+ +
+
+ +
+ + +
+
+
+ +

SNB Driver - Part 1

+
Tags:
+ SNB + , DRIVER + , TPC-C + , INTERACTIVE + +
+
+ +

In this multi-part blog we consider the challenge of running the LDBC Social Network Interactive Benchmark (LDBC SNB) workload in parallel, i.e. the design of the workload driver that will issue the queries against the System Under Test (SUT). We go through design principles that were implemented for the LDBC SNB workload generator/load tester (simply referred to as driver). Software and documentation for this driver is available here: …

+ +
+
+ +
+ + +
+
+
+ +

Making Semantic Publishing Execution Rules

+
Tags:
+ SPB + , TEST RUN + +
+
+ +

LDBC SPB (Semantic Publishing Benchmark) is based on the BBC linked data platform use case. Thus the data modelling and transaction mix reflects the BBC’s actual utilization of RDF. But a benchmark is not only a condensation of current best practices. The BBC linked data platform is an Ontotext Graph DB deployment. Graph DB was formerly known as Owlim.

+

So, in SPB we wanted to address substantially more complex queries than the lookups that …

+ +
+
+ +
+ + +
+
+ +
+ +

The Semantic Publishing Benchmark (SPB), developed in the context of LDBC, aims at measuring the read and write operations that can be performed in the context of a media organisation. It simulates the management and consumption of RDF metadata describing media assets and creative works. The scenario is based around a media organisation that maintains RDF descriptions of its catalogue of creative works. These descriptions use a set of ontologies …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/6/index.html b/jan-2-2006/page/6/index.html new file mode 100644 index 00000000..6c797318 --- /dev/null +++ b/jan-2-2006/page/6/index.html @@ -0,0 +1,772 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Choke Point Based Benchmark Design

+
Tags:
+ DATABASE + , BENCHMARK + , DESIGN + +
+
+ +

The mission of the Linked Data Benchmark Council (LDBC) is to design and maintain benchmarks for graph data management systems, to establish and enforce standards for running these benchmarks, and to publish and arbitrate around the official benchmark results. The council and its https://ldbcouncil.org website have just launched, and in its first 1.5 years of existence, most effort at LDBC has gone into investigating the needs of the field through interaction …

+ +
+
+ +
+ + +
+
+
+ +

New Website Online LDBC Benchmarks Reach Public Draft

+
Tags:
+ DEVELOPER + , INDUSTRY + +
+
+ +

The Linked Data Benchmark Council (LDBC) is reaching a milestone today, June 23, 2014, in announcing that two of the benchmarks that it has been developing for the past 1.5 years have now reached the status of Public Draft. This concerns the Semantic Publishing Benchmark (SPB) and the interactive workload of the Social Network Benchmark (SNB). In case of LDBC, the release is staged: now the benchmark software just runs read-only queries. This will be …

+ +
+
+ +
+ + +
+
+
+ +

Social Network Benchmark Goals

+
Tags:
+ SNB + , DATAGEN + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ +

Social Network interaction is amongst the most natural and widely spread activities in the internet society, and it has turned out to be a very useful way for people to socialise at different levels (friendship, professional, hobby, etc.). As such, Social Networks are well understood from the point of view of the data involved and the interaction required by their actors. Thus, the concepts of friends of friends, or retweet are well established …

+ +
+
+ +
+ + +
+
+ +
+ +

It is with great pleasure that we announce the new LDBC organisation site at www.ldbcouncil.org. The LDBC started as a European Community FP7 funded project with the objective to create, foster and become an industry reference for benchmarking RDF and Graph technologies. A period of more than one and a half years has led us to the creation of the first two workloads, the Semantic Publishing Benchmark and the Social Network Benchmark in its …

+ +
+
+ +
+ + +
+
+
+ +

2nd International Workshop on Benchmarking RDF Systems

+
Tags:
+ WORKSHOP + , CFP + , BENCHMARK + , BERSYS + +
+
+ +

Following the 1st International Workshop on Benchmarking RDF Systems (BeRSys 2013), the aim of the BeRSys 2014 workshop is to provide a discussion forum where researchers and industry practitioners can meet to discuss topics related to the performance of RDF systems. BeRSys 2014 is the only workshop dedicated to benchmarking different aspects of RDF engines, in line with the TPCTC series of workshops. The focus of the workshop is to expose and initiate …

+ +
+
+ +
+ + +
+
+
+ +

DATAGEN: Data Generation for the Social Network Benchmark

+
Tags:
+ DATAGEN + , SOCIAL NETWORK + , SNB + +
+
+ +

As explained in a previous post, the LDBC Social Network Benchmark (LDBC-SNB) has the objective to provide a realistic yet challenging workload, consisting of a social network and a set of queries. Both have to be realistic, easy to understand and easy to generate. This post has the objective to discuss the main features of DATAGEN, the social network data generator provided by LDBC-SNB, which is an evolution of S3G2 [1].

+

One of the most …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/7/index.html b/jan-2-2006/page/7/index.html new file mode 100644 index 00000000..ebb8c1a5 --- /dev/null +++ b/jan-2-2006/page/7/index.html @@ -0,0 +1,787 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Getting Started With SNB

+
Tags:
+ SNB + , INTERACTIVE + , DATAGEN + +
+
+ +

In a previous blog post titled “Is SNB like Facebook’s LinkBench?”, Peter Boncz discusses the design philosophy that shapes SNB and how it compares to other existing benchmarks such as LinkBench. In this post, I will briefly introduce the essential parts forming SNB, which are DATAGEN, the LDBC execution driver and the workloads.

+

DATAGEN

+

DATAGEN is the data generator used by all the workloads of SNB. Here we introduced the …

+ +
+
+ +
+ + +
+
+ +
+ +

The LDBC Social Network Benchmark (SNB) is composed of three distinct workloads, interactive, business intelligence and graph analytics. This post introduces the interactive workload.

+

The benchmark measures the speed of queries of medium complexity against a social network being constantly updated. The queries are scoped to a user’s social environment and potentially access data associated with the friends of a user and their friends.

+

This …

+ +
+
+ +
+ + +
+
+
+ +

Is SNB Like Facebook’s LinkBench?

+
Tags:
+ DEVELOPER + , SNB + , INTERACTIVE + , BI + , GRAPHALYTICS + +
+
+ + + post/is-snb-like-facebooks-linkbench/SNB-workloads-vs-systems.jpg +
+ +
+ +

In this post, I will discuss in some detail the rationale and goals of the design of the Social Network Benchmark (SNB) and explain how it relates to real …

+ +
+
+ +
+ + +
+
+
+ +

Making It Interactive

+
Tags:
+ SNB + , BENCHMARKING + , TPC + , SPARQL + , INTERACTIVE + +
+
+ +

Synopsis: Now is the time to finalize the interactive part of the Social Network Benchmark (SNB). The benchmark must be both credible in a real social network setting and pose new challenges. There are many hard queries but not enough representation for what online systems in fact do. So, the workload mix must strike a balance between the practice and presenting new challenges.

+

It is about to be showtime for LDBC. The initial installment of the …

+ +
+
+ +
+ + +
+
+
+ +

SNB Data Generator - Getting Started

+
Tags:
+ DATAGEN + , SNB + , SOCIAL NETWORK + +
+
+ +

In previous posts (this and this) we briefly introduced the design goals and philosophy behind DATAGEN, the data generator used in LDBC-SNB. In this post, I will explain how to use DATAGEN to generate the necessary datasets to run LDBC-SNB. Of course, as DATAGEN is continuously under development, the instructions given in this tutorial might change in the future.

+

Getting and Configuring Hadoop

+

DATAGEN runs on top of Hadoop 1.2.1 to be scalable. …

+ +
+
+ +
+ + +
+
+
+ +

The Day of Graph Analytics

+
Tags:
+ ANALYTICS + , SNB + +
+
+ +

Note: consider this post as a continuation of the “Making it interactive” post by Orri Erling.

+

I have now completed the Virtuoso TPC-H work, including scale out. Optimization possibilities extend to infinity but the present level is good enough. TPC-H is the classic of all analytics benchmarks and is difficult enough, I have extensive commentary on this on my blog (In Hoc Signo Vinces series), including experimental results. This is, …

+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/jan-2-2006/page/8/index.html b/jan-2-2006/page/8/index.html new file mode 100644 index 00000000..7ab0e0b9 --- /dev/null +++ b/jan-2-2006/page/8/index.html @@ -0,0 +1,658 @@ + + + + + Jan 2, 2006 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + +
+ + + + + +
+ + + +
+
+
+
+ + +

Jan 2, 2006

+ + + + +
+
+
+
+ + + + +
+
+
+
+
+ + + +
+
+
+ +

Using LDBC SPB to Find OWLIM Performance Issues

+
Tags:
+ LDBC + , SPB + , RDF + +
+
+ +

During the past six months we (the OWLIM Team at Ontotext) have integrated the LDBC Semantic Publishing Benchmark (LDBC-SPB) as a part of our development and release process.

+

The first thing we started using LDBC-SPB for is monitoring the performance of our RDF Store when a new release is about to come out.

+

Initially, we decided to fix some of the benchmark parameters:

+
    +
  • the dataset size - 50 million triples (LDBC-SPB50) * benchmark warmup …
+ +
+
+ +
+ + +
+ + + + + + + + + + + + + +
+ +
+
+ + + +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/js/_vendor/bootstrap.bundle.min.js b/js/_vendor/bootstrap.bundle.min.js new file mode 100644 index 00000000..0ded08ee --- /dev/null +++ b/js/_vendor/bootstrap.bundle.min.js @@ -0,0 +1,6 @@ +/*! + * Bootstrap v4.6.0 (https://getbootstrap.com/) + * Copyright 2011-2021 The Bootstrap Authors (https://github.com/twbs/bootstrap/graphs/contributors) + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/main/LICENSE) + */ +!function(t,e){"object"==typeof exports&&"undefined"!=typeof module?e(exports,require("jquery")):"function"==typeof define&&define.amd?define(["exports","jquery"],e):e((t="undefined"!=typeof globalThis?globalThis:t||self).bootstrap={},t.jQuery)}(this,(function(t,e){"use strict";function n(t){return t&&"object"==typeof t&&"default"in t?t:{default:t}}var i=n(e);function o(t,e){for(var n=0;n=4)throw new Error("Bootstrap's JavaScript requires at least jQuery v1.9.1 but less than v4.0.0")}};l.jQueryDetection(),i.default.fn.emulateTransitionEnd=s,i.default.event.special[l.TRANSITION_END]={bindType:"transitionend",delegateType:"transitionend",handle:function(t){if(i.default(t.target).is(this))return t.handleObj.handler.apply(this,arguments)}};var u="alert",f=i.default.fn[u],d=function(){function t(t){this._element=t}var e=t.prototype;return e.close=function(t){var e=this._element;t&&(e=this._getRootElement(t)),this._triggerCloseEvent(e).isDefaultPrevented()||this._removeElement(e)},e.dispose=function(){i.default.removeData(this._element,"bs.alert"),this._element=null},e._getRootElement=function(t){var e=l.getSelectorFromElement(t),n=!1;return e&&(n=document.querySelector(e)),n||(n=i.default(t).closest(".alert")[0]),n},e._triggerCloseEvent=function(t){var e=i.default.Event("close.bs.alert");return i.default(t).trigger(e),e},e._removeElement=function(t){var 
e=this;if(i.default(t).removeClass("show"),i.default(t).hasClass("fade")){var n=l.getTransitionDurationFromElement(t);i.default(t).one(l.TRANSITION_END,(function(n){return e._destroyElement(t,n)})).emulateTransitionEnd(n)}else this._destroyElement(t)},e._destroyElement=function(t){i.default(t).detach().trigger("closed.bs.alert").remove()},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this),o=n.data("bs.alert");o||(o=new t(this),n.data("bs.alert",o)),"close"===e&&o[e](this)}))},t._handleDismiss=function(t){return function(e){e&&e.preventDefault(),t.close(this)}},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}}]),t}();i.default(document).on("click.bs.alert.data-api",'[data-dismiss="alert"]',d._handleDismiss(new d)),i.default.fn[u]=d._jQueryInterface,i.default.fn[u].Constructor=d,i.default.fn[u].noConflict=function(){return i.default.fn[u]=f,d._jQueryInterface};var c=i.default.fn.button,h=function(){function t(t){this._element=t,this.shouldAvoidTriggerChange=!1}var e=t.prototype;return e.toggle=function(){var t=!0,e=!0,n=i.default(this._element).closest('[data-toggle="buttons"]')[0];if(n){var o=this._element.querySelector('input:not([type="hidden"])');if(o){if("radio"===o.type)if(o.checked&&this._element.classList.contains("active"))t=!1;else{var r=n.querySelector(".active");r&&i.default(r).removeClass("active")}t&&("checkbox"!==o.type&&"radio"!==o.type||(o.checked=!this._element.classList.contains("active")),this.shouldAvoidTriggerChange||i.default(o).trigger("change")),o.focus(),e=!1}}this._element.hasAttribute("disabled")||this._element.classList.contains("disabled")||(e&&this._element.setAttribute("aria-pressed",!this._element.classList.contains("active")),t&&i.default(this._element).toggleClass("active"))},e.dispose=function(){i.default.removeData(this._element,"bs.button"),this._element=null},t._jQueryInterface=function(e,n){return this.each((function(){var o=i.default(this),r=o.data("bs.button");r||(r=new 
t(this),o.data("bs.button",r)),r.shouldAvoidTriggerChange=n,"toggle"===e&&r[e]()}))},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}}]),t}();i.default(document).on("click.bs.button.data-api",'[data-toggle^="button"]',(function(t){var e=t.target,n=e;if(i.default(e).hasClass("btn")||(e=i.default(e).closest(".btn")[0]),!e||e.hasAttribute("disabled")||e.classList.contains("disabled"))t.preventDefault();else{var o=e.querySelector('input:not([type="hidden"])');if(o&&(o.hasAttribute("disabled")||o.classList.contains("disabled")))return void t.preventDefault();"INPUT"!==n.tagName&&"LABEL"===e.tagName||h._jQueryInterface.call(i.default(e),"toggle","INPUT"===n.tagName)}})).on("focus.bs.button.data-api blur.bs.button.data-api",'[data-toggle^="button"]',(function(t){var e=i.default(t.target).closest(".btn")[0];i.default(e).toggleClass("focus",/^focus(in)?$/.test(t.type))})),i.default(window).on("load.bs.button.data-api",(function(){for(var t=[].slice.call(document.querySelectorAll('[data-toggle="buttons"] .btn')),e=0,n=t.length;e0,this._pointerEvent=Boolean(window.PointerEvent||window.MSPointerEvent),this._addEventListeners()}var e=t.prototype;return e.next=function(){this._isSliding||this._slide("next")},e.nextWhenVisible=function(){var t=i.default(this._element);!document.hidden&&t.is(":visible")&&"hidden"!==t.css("visibility")&&this.next()},e.prev=function(){this._isSliding||this._slide("prev")},e.pause=function(t){t||(this._isPaused=!0),this._element.querySelector(".carousel-item-next, .carousel-item-prev")&&(l.triggerTransitionEnd(this._element),this.cycle(!0)),clearInterval(this._interval),this._interval=null},e.cycle=function(t){t||(this._isPaused=!1),this._interval&&(clearInterval(this._interval),this._interval=null),this._config.interval&&!this._isPaused&&(this._updateInterval(),this._interval=setInterval((document.visibilityState?this.nextWhenVisible:this.next).bind(this),this._config.interval))},e.to=function(t){var 
e=this;this._activeElement=this._element.querySelector(".active.carousel-item");var n=this._getItemIndex(this._activeElement);if(!(t>this._items.length-1||t<0))if(this._isSliding)i.default(this._element).one("slid.bs.carousel",(function(){return e.to(t)}));else{if(n===t)return this.pause(),void this.cycle();var o=t>n?"next":"prev";this._slide(o,this._items[t])}},e.dispose=function(){i.default(this._element).off(m),i.default.removeData(this._element,"bs.carousel"),this._items=null,this._config=null,this._element=null,this._interval=null,this._isPaused=null,this._isSliding=null,this._activeElement=null,this._indicatorsElement=null},e._getConfig=function(t){return t=a({},v,t),l.typeCheckConfig(p,t,_),t},e._handleSwipe=function(){var t=Math.abs(this.touchDeltaX);if(!(t<=40)){var e=t/this.touchDeltaX;this.touchDeltaX=0,e>0&&this.prev(),e<0&&this.next()}},e._addEventListeners=function(){var t=this;this._config.keyboard&&i.default(this._element).on("keydown.bs.carousel",(function(e){return t._keydown(e)})),"hover"===this._config.pause&&i.default(this._element).on("mouseenter.bs.carousel",(function(e){return t.pause(e)})).on("mouseleave.bs.carousel",(function(e){return t.cycle(e)})),this._config.touch&&this._addTouchEventListeners()},e._addTouchEventListeners=function(){var t=this;if(this._touchSupported){var e=function(e){t._pointerEvent&&b[e.originalEvent.pointerType.toUpperCase()]?t.touchStartX=e.originalEvent.clientX:t._pointerEvent||(t.touchStartX=e.originalEvent.touches[0].clientX)},n=function(e){t._pointerEvent&&b[e.originalEvent.pointerType.toUpperCase()]&&(t.touchDeltaX=e.originalEvent.clientX-t.touchStartX),t._handleSwipe(),"hover"===t._config.pause&&(t.pause(),t.touchTimeout&&clearTimeout(t.touchTimeout),t.touchTimeout=setTimeout((function(e){return t.cycle(e)}),500+t._config.interval))};i.default(this._element.querySelectorAll(".carousel-item img")).on("dragstart.bs.carousel",(function(t){return 
t.preventDefault()})),this._pointerEvent?(i.default(this._element).on("pointerdown.bs.carousel",(function(t){return e(t)})),i.default(this._element).on("pointerup.bs.carousel",(function(t){return n(t)})),this._element.classList.add("pointer-event")):(i.default(this._element).on("touchstart.bs.carousel",(function(t){return e(t)})),i.default(this._element).on("touchmove.bs.carousel",(function(e){return function(e){e.originalEvent.touches&&e.originalEvent.touches.length>1?t.touchDeltaX=0:t.touchDeltaX=e.originalEvent.touches[0].clientX-t.touchStartX}(e)})),i.default(this._element).on("touchend.bs.carousel",(function(t){return n(t)})))}},e._keydown=function(t){if(!/input|textarea/i.test(t.target.tagName))switch(t.which){case 37:t.preventDefault(),this.prev();break;case 39:t.preventDefault(),this.next()}},e._getItemIndex=function(t){return this._items=t&&t.parentNode?[].slice.call(t.parentNode.querySelectorAll(".carousel-item")):[],this._items.indexOf(t)},e._getItemByDirection=function(t,e){var n="next"===t,i="prev"===t,o=this._getItemIndex(e),r=this._items.length-1;if((i&&0===o||n&&o===r)&&!this._config.wrap)return e;var a=(o+("prev"===t?-1:1))%this._items.length;return-1===a?this._items[this._items.length-1]:this._items[a]},e._triggerSlideEvent=function(t,e){var n=this._getItemIndex(t),o=this._getItemIndex(this._element.querySelector(".active.carousel-item")),r=i.default.Event("slide.bs.carousel",{relatedTarget:t,direction:e,from:o,to:n});return i.default(this._element).trigger(r),r},e._setActiveIndicatorElement=function(t){if(this._indicatorsElement){var e=[].slice.call(this._indicatorsElement.querySelectorAll(".active"));i.default(e).removeClass("active");var n=this._indicatorsElement.children[this._getItemIndex(t)];n&&i.default(n).addClass("active")}},e._updateInterval=function(){var t=this._activeElement||this._element.querySelector(".active.carousel-item");if(t){var 
e=parseInt(t.getAttribute("data-interval"),10);e?(this._config.defaultInterval=this._config.defaultInterval||this._config.interval,this._config.interval=e):this._config.interval=this._config.defaultInterval||this._config.interval}},e._slide=function(t,e){var n,o,r,a=this,s=this._element.querySelector(".active.carousel-item"),u=this._getItemIndex(s),f=e||s&&this._getItemByDirection(t,s),d=this._getItemIndex(f),c=Boolean(this._interval);if("next"===t?(n="carousel-item-left",o="carousel-item-next",r="left"):(n="carousel-item-right",o="carousel-item-prev",r="right"),f&&i.default(f).hasClass("active"))this._isSliding=!1;else if(!this._triggerSlideEvent(f,r).isDefaultPrevented()&&s&&f){this._isSliding=!0,c&&this.pause(),this._setActiveIndicatorElement(f),this._activeElement=f;var h=i.default.Event("slid.bs.carousel",{relatedTarget:f,direction:r,from:u,to:d});if(i.default(this._element).hasClass("slide")){i.default(f).addClass(o),l.reflow(f),i.default(s).addClass(n),i.default(f).addClass(n);var p=l.getTransitionDurationFromElement(s);i.default(s).one(l.TRANSITION_END,(function(){i.default(f).removeClass(n+" "+o).addClass("active"),i.default(s).removeClass("active "+o+" "+n),a._isSliding=!1,setTimeout((function(){return i.default(a._element).trigger(h)}),0)})).emulateTransitionEnd(p)}else i.default(s).removeClass("active"),i.default(f).addClass("active"),this._isSliding=!1,i.default(this._element).trigger(h);c&&this.cycle()}},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this).data("bs.carousel"),o=a({},v,i.default(this).data());"object"==typeof e&&(o=a({},o,e));var r="string"==typeof e?e:o.slide;if(n||(n=new t(this,o),i.default(this).data("bs.carousel",n)),"number"==typeof e)n.to(e);else if("string"==typeof r){if("undefined"==typeof n[r])throw new TypeError('No method named "'+r+'"');n[r]()}else o.interval&&o.ride&&(n.pause(),n.cycle())}))},t._dataApiClickHandler=function(e){var n=l.getSelectorFromElement(this);if(n){var 
o=i.default(n)[0];if(o&&i.default(o).hasClass("carousel")){var r=a({},i.default(o).data(),i.default(this).data()),s=this.getAttribute("data-slide-to");s&&(r.interval=!1),t._jQueryInterface.call(i.default(o),r),s&&i.default(o).data("bs.carousel").to(s),e.preventDefault()}}},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}},{key:"Default",get:function(){return v}}]),t}();i.default(document).on("click.bs.carousel.data-api","[data-slide], [data-slide-to]",y._dataApiClickHandler),i.default(window).on("load.bs.carousel.data-api",(function(){for(var t=[].slice.call(document.querySelectorAll('[data-ride="carousel"]')),e=0,n=t.length;e0&&(this._selector=a,this._triggerArray.push(r))}this._parent=this._config.parent?this._getParent():null,this._config.parent||this._addAriaAndCollapsedClass(this._element,this._triggerArray),this._config.toggle&&this.toggle()}var e=t.prototype;return e.toggle=function(){i.default(this._element).hasClass("show")?this.hide():this.show()},e.show=function(){var e,n,o=this;if(!this._isTransitioning&&!i.default(this._element).hasClass("show")&&(this._parent&&0===(e=[].slice.call(this._parent.querySelectorAll(".show, .collapsing")).filter((function(t){return"string"==typeof o._config.parent?t.getAttribute("data-parent")===o._config.parent:t.classList.contains("collapse")}))).length&&(e=null),!(e&&(n=i.default(e).not(this._selector).data("bs.collapse"))&&n._isTransitioning))){var r=i.default.Event("show.bs.collapse");if(i.default(this._element).trigger(r),!r.isDefaultPrevented()){e&&(t._jQueryInterface.call(i.default(e).not(this._selector),"hide"),n||i.default(e).data("bs.collapse",null));var a=this._getDimension();i.default(this._element).removeClass("collapse").addClass("collapsing"),this._element.style[a]=0,this._triggerArray.length&&i.default(this._triggerArray).removeClass("collapsed").attr("aria-expanded",!0),this.setTransitioning(!0);var 
s="scroll"+(a[0].toUpperCase()+a.slice(1)),u=l.getTransitionDurationFromElement(this._element);i.default(this._element).one(l.TRANSITION_END,(function(){i.default(o._element).removeClass("collapsing").addClass("collapse show"),o._element.style[a]="",o.setTransitioning(!1),i.default(o._element).trigger("shown.bs.collapse")})).emulateTransitionEnd(u),this._element.style[a]=this._element[s]+"px"}}},e.hide=function(){var t=this;if(!this._isTransitioning&&i.default(this._element).hasClass("show")){var e=i.default.Event("hide.bs.collapse");if(i.default(this._element).trigger(e),!e.isDefaultPrevented()){var n=this._getDimension();this._element.style[n]=this._element.getBoundingClientRect()[n]+"px",l.reflow(this._element),i.default(this._element).addClass("collapsing").removeClass("collapse show");var o=this._triggerArray.length;if(o>0)for(var r=0;r=0)return 1;return 0}();var k=D&&window.Promise?function(t){var e=!1;return function(){e||(e=!0,window.Promise.resolve().then((function(){e=!1,t()})))}}:function(t){var e=!1;return function(){e||(e=!0,setTimeout((function(){e=!1,t()}),N))}};function A(t){return t&&"[object Function]"==={}.toString.call(t)}function I(t,e){if(1!==t.nodeType)return[];var n=t.ownerDocument.defaultView.getComputedStyle(t,null);return e?n[e]:n}function O(t){return"HTML"===t.nodeName?t:t.parentNode||t.host}function x(t){if(!t)return document.body;switch(t.nodeName){case"HTML":case"BODY":return t.ownerDocument.body;case"#document":return t.body}var e=I(t),n=e.overflow,i=e.overflowX,o=e.overflowY;return/(auto|scroll|overlay)/.test(n+o+i)?t:x(O(t))}function j(t){return t&&t.referenceNode?t.referenceNode:t}var L=D&&!(!window.MSInputMethodContext||!document.documentMode),P=D&&/MSIE 10/.test(navigator.userAgent);function F(t){return 11===t?L:10===t?P:L||P}function R(t){if(!t)return document.documentElement;for(var e=F(10)?document.body:null,n=t.offsetParent||null;n===e&&t.nextElementSibling;)n=(t=t.nextElementSibling).offsetParent;var i=n&&n.nodeName;return 
i&&"BODY"!==i&&"HTML"!==i?-1!==["TH","TD","TABLE"].indexOf(n.nodeName)&&"static"===I(n,"position")?R(n):n:t?t.ownerDocument.documentElement:document.documentElement}function H(t){return null!==t.parentNode?H(t.parentNode):t}function M(t,e){if(!(t&&t.nodeType&&e&&e.nodeType))return document.documentElement;var n=t.compareDocumentPosition(e)&Node.DOCUMENT_POSITION_FOLLOWING,i=n?t:e,o=n?e:t,r=document.createRange();r.setStart(i,0),r.setEnd(o,0);var a,s,l=r.commonAncestorContainer;if(t!==l&&e!==l||i.contains(o))return"BODY"===(s=(a=l).nodeName)||"HTML"!==s&&R(a.firstElementChild)!==a?R(l):l;var u=H(t);return u.host?M(u.host,e):M(t,H(e).host)}function q(t){var e=arguments.length>1&&void 0!==arguments[1]?arguments[1]:"top",n="top"===e?"scrollTop":"scrollLeft",i=t.nodeName;if("BODY"===i||"HTML"===i){var o=t.ownerDocument.documentElement,r=t.ownerDocument.scrollingElement||o;return r[n]}return t[n]}function B(t,e){var n=arguments.length>2&&void 0!==arguments[2]&&arguments[2],i=q(e,"top"),o=q(e,"left"),r=n?-1:1;return t.top+=i*r,t.bottom+=i*r,t.left+=o*r,t.right+=o*r,t}function Q(t,e){var n="x"===e?"Left":"Top",i="Left"===n?"Right":"Bottom";return parseFloat(t["border"+n+"Width"])+parseFloat(t["border"+i+"Width"])}function W(t,e,n,i){return Math.max(e["offset"+t],e["scroll"+t],n["client"+t],n["offset"+t],n["scroll"+t],F(10)?parseInt(n["offset"+t])+parseInt(i["margin"+("Height"===t?"Top":"Left")])+parseInt(i["margin"+("Height"===t?"Bottom":"Right")]):0)}function U(t){var e=t.body,n=t.documentElement,i=F(10)&&getComputedStyle(n);return{height:W("Height",e,n,i),width:W("Width",e,n,i)}}var V=function(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")},Y=function(){function t(t,e){for(var n=0;n2&&void 0!==arguments[2]&&arguments[2],i=F(10),o="HTML"===e.nodeName,r=G(t),a=G(e),s=x(t),l=I(e),u=parseFloat(l.borderTopWidth),f=parseFloat(l.borderLeftWidth);n&&o&&(a.top=Math.max(a.top,0),a.left=Math.max(a.left,0));var 
d=K({top:r.top-a.top-u,left:r.left-a.left-f,width:r.width,height:r.height});if(d.marginTop=0,d.marginLeft=0,!i&&o){var c=parseFloat(l.marginTop),h=parseFloat(l.marginLeft);d.top-=u-c,d.bottom-=u-c,d.left-=f-h,d.right-=f-h,d.marginTop=c,d.marginLeft=h}return(i&&!n?e.contains(s):e===s&&"BODY"!==s.nodeName)&&(d=B(d,e)),d}function J(t){var e=arguments.length>1&&void 0!==arguments[1]&&arguments[1],n=t.ownerDocument.documentElement,i=$(t,n),o=Math.max(n.clientWidth,window.innerWidth||0),r=Math.max(n.clientHeight,window.innerHeight||0),a=e?0:q(n),s=e?0:q(n,"left"),l={top:a-i.top+i.marginTop,left:s-i.left+i.marginLeft,width:o,height:r};return K(l)}function Z(t){var e=t.nodeName;if("BODY"===e||"HTML"===e)return!1;if("fixed"===I(t,"position"))return!0;var n=O(t);return!!n&&Z(n)}function tt(t){if(!t||!t.parentElement||F())return document.documentElement;for(var e=t.parentElement;e&&"none"===I(e,"transform");)e=e.parentElement;return e||document.documentElement}function et(t,e,n,i){var o=arguments.length>4&&void 0!==arguments[4]&&arguments[4],r={top:0,left:0},a=o?tt(t):M(t,j(e));if("viewport"===i)r=J(a,o);else{var s=void 0;"scrollParent"===i?"BODY"===(s=x(O(e))).nodeName&&(s=t.ownerDocument.documentElement):s="window"===i?t.ownerDocument.documentElement:i;var l=$(s,a,o);if("HTML"!==s.nodeName||Z(a))r=l;else{var u=U(t.ownerDocument),f=u.height,d=u.width;r.top+=l.top-l.marginTop,r.bottom=f+l.top,r.left+=l.left-l.marginLeft,r.right=d+l.left}}var c="number"==typeof(n=n||0);return r.left+=c?n:n.left||0,r.top+=c?n:n.top||0,r.right-=c?n:n.right||0,r.bottom-=c?n:n.bottom||0,r}function nt(t){return t.width*t.height}function it(t,e,n,i,o){var r=arguments.length>5&&void 0!==arguments[5]?arguments[5]:0;if(-1===t.indexOf("auto"))return t;var a=et(n,i,r,o),s={top:{width:a.width,height:e.top-a.top},right:{width:a.right-e.right,height:a.height},bottom:{width:a.width,height:a.bottom-e.bottom},left:{width:e.left-a.left,height:a.height}},l=Object.keys(s).map((function(t){return 
X({key:t},s[t],{area:nt(s[t])})})).sort((function(t,e){return e.area-t.area})),u=l.filter((function(t){var e=t.width,i=t.height;return e>=n.clientWidth&&i>=n.clientHeight})),f=u.length>0?u[0].key:l[0].key,d=t.split("-")[1];return f+(d?"-"+d:"")}function ot(t,e,n){var i=arguments.length>3&&void 0!==arguments[3]?arguments[3]:null,o=i?tt(e):M(e,j(n));return $(n,o,i)}function rt(t){var e=t.ownerDocument.defaultView.getComputedStyle(t),n=parseFloat(e.marginTop||0)+parseFloat(e.marginBottom||0),i=parseFloat(e.marginLeft||0)+parseFloat(e.marginRight||0);return{width:t.offsetWidth+i,height:t.offsetHeight+n}}function at(t){var e={left:"right",right:"left",bottom:"top",top:"bottom"};return t.replace(/left|right|bottom|top/g,(function(t){return e[t]}))}function st(t,e,n){n=n.split("-")[0];var i=rt(t),o={width:i.width,height:i.height},r=-1!==["right","left"].indexOf(n),a=r?"top":"left",s=r?"left":"top",l=r?"height":"width",u=r?"width":"height";return o[a]=e[a]+e[l]/2-i[l]/2,o[s]=n===s?e[s]-i[u]:e[at(s)],o}function lt(t,e){return Array.prototype.find?t.find(e):t.filter(e)[0]}function ut(t,e,n){return(void 0===n?t:t.slice(0,function(t,e,n){if(Array.prototype.findIndex)return t.findIndex((function(t){return t[e]===n}));var i=lt(t,(function(t){return t[e]===n}));return t.indexOf(i)}(t,"name",n))).forEach((function(t){t.function&&console.warn("`modifier.function` is deprecated, use `modifier.fn`!");var n=t.function||t.fn;t.enabled&&A(n)&&(e.offsets.popper=K(e.offsets.popper),e.offsets.reference=K(e.offsets.reference),e=n(e,t))})),e}function ft(){if(!this.state.isDestroyed){var 
t={instance:this,styles:{},arrowStyles:{},attributes:{},flipped:!1,offsets:{}};t.offsets.reference=ot(this.state,this.popper,this.reference,this.options.positionFixed),t.placement=it(this.options.placement,t.offsets.reference,this.popper,this.reference,this.options.modifiers.flip.boundariesElement,this.options.modifiers.flip.padding),t.originalPlacement=t.placement,t.positionFixed=this.options.positionFixed,t.offsets.popper=st(this.popper,t.offsets.reference,t.placement),t.offsets.popper.position=this.options.positionFixed?"fixed":"absolute",t=ut(this.modifiers,t),this.state.isCreated?this.options.onUpdate(t):(this.state.isCreated=!0,this.options.onCreate(t))}}function dt(t,e){return t.some((function(t){var n=t.name;return t.enabled&&n===e}))}function ct(t){for(var e=[!1,"ms","Webkit","Moz","O"],n=t.charAt(0).toUpperCase()+t.slice(1),i=0;i1&&void 0!==arguments[1]&&arguments[1],n=Tt.indexOf(t),i=Tt.slice(n+1).concat(Tt.slice(0,n));return e?i.reverse():i}var St="flip",Dt="clockwise",Nt="counterclockwise";function kt(t,e,n,i){var o=[0,0],r=-1!==["right","left"].indexOf(i),a=t.split(/(\+|\-)/).map((function(t){return t.trim()})),s=a.indexOf(lt(a,(function(t){return-1!==t.search(/,|\s/)})));a[s]&&-1===a[s].indexOf(",")&&console.warn("Offsets separated by white space(s) are deprecated, use a comma (,) instead.");var l=/\s*,\s*|\s+/,u=-1!==s?[a.slice(0,s).concat([a[s].split(l)[0]]),[a[s].split(l)[1]].concat(a.slice(s+1))]:[a];return(u=u.map((function(t,i){var o=(1===i?!r:r)?"height":"width",a=!1;return t.reduce((function(t,e){return""===t[t.length-1]&&-1!==["+","-"].indexOf(e)?(t[t.length-1]=e,a=!0,t):a?(t[t.length-1]+=e,a=!1,t):t.concat(e)}),[]).map((function(t){return function(t,e,n,i){var o=t.match(/((?:\-|\+)?\d*\.?\d*)(.*)/),r=+o[1],a=o[2];if(!r)return t;if(0===a.indexOf("%")){var s=void 0;switch(a){case"%p":s=n;break;case"%":case"%r":default:s=i}return 
K(s)[e]/100*r}if("vh"===a||"vw"===a)return("vh"===a?Math.max(document.documentElement.clientHeight,window.innerHeight||0):Math.max(document.documentElement.clientWidth,window.innerWidth||0))/100*r;return r}(t,o,e,n)}))}))).forEach((function(t,e){t.forEach((function(n,i){_t(n)&&(o[e]+=n*("-"===t[i-1]?-1:1))}))})),o}var At={placement:"bottom",positionFixed:!1,eventsEnabled:!0,removeOnDestroy:!1,onCreate:function(){},onUpdate:function(){},modifiers:{shift:{order:100,enabled:!0,fn:function(t){var e=t.placement,n=e.split("-")[0],i=e.split("-")[1];if(i){var o=t.offsets,r=o.reference,a=o.popper,s=-1!==["bottom","top"].indexOf(n),l=s?"left":"top",u=s?"width":"height",f={start:z({},l,r[l]),end:z({},l,r[l]+r[u]-a[u])};t.offsets.popper=X({},a,f[i])}return t}},offset:{order:200,enabled:!0,fn:function(t,e){var n=e.offset,i=t.placement,o=t.offsets,r=o.popper,a=o.reference,s=i.split("-")[0],l=void 0;return l=_t(+n)?[+n,0]:kt(n,r,a,s),"left"===s?(r.top+=l[0],r.left-=l[1]):"right"===s?(r.top+=l[0],r.left+=l[1]):"top"===s?(r.left+=l[0],r.top-=l[1]):"bottom"===s&&(r.left+=l[0],r.top+=l[1]),t.popper=r,t},offset:0},preventOverflow:{order:300,enabled:!0,fn:function(t,e){var n=e.boundariesElement||R(t.instance.popper);t.instance.reference===n&&(n=R(n));var i=ct("transform"),o=t.instance.popper.style,r=o.top,a=o.left,s=o[i];o.top="",o.left="",o[i]="";var l=et(t.instance.popper,t.instance.reference,e.padding,n,t.positionFixed);o.top=r,o.left=a,o[i]=s,e.boundaries=l;var u=e.priority,f=t.offsets.popper,d={primary:function(t){var n=f[t];return f[t]l[t]&&!e.escapeWithReference&&(i=Math.min(f[n],l[t]-("right"===t?f.width:f.height))),z({},n,i)}};return u.forEach((function(t){var e=-1!==["left","top"].indexOf(t)?"primary":"secondary";f=X({},f,d[e](t))})),t.offsets.popper=f,t},priority:["left","right","top","bottom"],padding:5,boundariesElement:"scrollParent"},keepTogether:{order:400,enabled:!0,fn:function(t){var 
e=t.offsets,n=e.popper,i=e.reference,o=t.placement.split("-")[0],r=Math.floor,a=-1!==["top","bottom"].indexOf(o),s=a?"right":"bottom",l=a?"left":"top",u=a?"width":"height";return n[s]r(i[s])&&(t.offsets.popper[l]=r(i[s])),t}},arrow:{order:500,enabled:!0,fn:function(t,e){var n;if(!wt(t.instance.modifiers,"arrow","keepTogether"))return t;var i=e.element;if("string"==typeof i){if(!(i=t.instance.popper.querySelector(i)))return t}else if(!t.instance.popper.contains(i))return console.warn("WARNING: `arrow.element` must be child of its popper element!"),t;var o=t.placement.split("-")[0],r=t.offsets,a=r.popper,s=r.reference,l=-1!==["left","right"].indexOf(o),u=l?"height":"width",f=l?"Top":"Left",d=f.toLowerCase(),c=l?"left":"top",h=l?"bottom":"right",p=rt(i)[u];s[h]-pa[h]&&(t.offsets.popper[d]+=s[d]+p-a[h]),t.offsets.popper=K(t.offsets.popper);var m=s[d]+s[u]/2-p/2,g=I(t.instance.popper),v=parseFloat(g["margin"+f]),_=parseFloat(g["border"+f+"Width"]),b=m-t.offsets.popper[d]-v-_;return b=Math.max(Math.min(a[u]-p,b),0),t.arrowElement=i,t.offsets.arrow=(z(n={},d,Math.round(b)),z(n,c,""),n),t},element:"[x-arrow]"},flip:{order:600,enabled:!0,fn:function(t,e){if(dt(t.instance.modifiers,"inner"))return t;if(t.flipped&&t.placement===t.originalPlacement)return t;var n=et(t.instance.popper,t.instance.reference,e.padding,e.boundariesElement,t.positionFixed),i=t.placement.split("-")[0],o=at(i),r=t.placement.split("-")[1]||"",a=[];switch(e.behavior){case St:a=[i,o];break;case Dt:a=Ct(i);break;case Nt:a=Ct(i,!0);break;default:a=e.behavior}return a.forEach((function(s,l){if(i!==s||a.length===l+1)return t;i=t.placement.split("-")[0],o=at(i);var 
u=t.offsets.popper,f=t.offsets.reference,d=Math.floor,c="left"===i&&d(u.right)>d(f.left)||"right"===i&&d(u.left)d(f.top)||"bottom"===i&&d(u.top)d(n.right),m=d(u.top)d(n.bottom),v="left"===i&&h||"right"===i&&p||"top"===i&&m||"bottom"===i&&g,_=-1!==["top","bottom"].indexOf(i),b=!!e.flipVariations&&(_&&"start"===r&&h||_&&"end"===r&&p||!_&&"start"===r&&m||!_&&"end"===r&&g),y=!!e.flipVariationsByContent&&(_&&"start"===r&&p||_&&"end"===r&&h||!_&&"start"===r&&g||!_&&"end"===r&&m),w=b||y;(c||v||w)&&(t.flipped=!0,(c||v)&&(i=a[l+1]),w&&(r=function(t){return"end"===t?"start":"start"===t?"end":t}(r)),t.placement=i+(r?"-"+r:""),t.offsets.popper=X({},t.offsets.popper,st(t.instance.popper,t.offsets.reference,t.placement)),t=ut(t.instance.modifiers,t,"flip"))})),t},behavior:"flip",padding:5,boundariesElement:"viewport",flipVariations:!1,flipVariationsByContent:!1},inner:{order:700,enabled:!1,fn:function(t){var e=t.placement,n=e.split("-")[0],i=t.offsets,o=i.popper,r=i.reference,a=-1!==["left","right"].indexOf(n),s=-1===["top","left"].indexOf(n);return o[a?"left":"top"]=r[n]-(s?o[a?"width":"height"]:0),t.placement=at(e),t.offsets.popper=K(o),t}},hide:{order:800,enabled:!0,fn:function(t){if(!wt(t.instance.modifiers,"hide","preventOverflow"))return t;var e=t.offsets.reference,n=lt(t.instance.modifiers,(function(t){return"preventOverflow"===t.name})).boundaries;if(e.bottomn.right||e.top>n.bottom||e.right2&&void 0!==arguments[2]?arguments[2]:{};V(this,t),this.scheduleUpdate=function(){return requestAnimationFrame(i.update)},this.update=k(this.update.bind(this)),this.options=X({},t.Defaults,o),this.state={isDestroyed:!1,isCreated:!1,scrollParents:[]},this.reference=e&&e.jquery?e[0]:e,this.popper=n&&n.jquery?n[0]:n,this.options.modifiers={},Object.keys(X({},t.Defaults.modifiers,o.modifiers)).forEach((function(e){i.options.modifiers[e]=X({},t.Defaults.modifiers[e]||{},o.modifiers?o.modifiers[e]:{})})),this.modifiers=Object.keys(this.options.modifiers).map((function(t){return 
X({name:t},i.options.modifiers[t])})).sort((function(t,e){return t.order-e.order})),this.modifiers.forEach((function(t){t.enabled&&A(t.onLoad)&&t.onLoad(i.reference,i.popper,i.options,t,i.state)})),this.update();var r=this.options.eventsEnabled;r&&this.enableEventListeners(),this.state.eventsEnabled=r}return Y(t,[{key:"update",value:function(){return ft.call(this)}},{key:"destroy",value:function(){return ht.call(this)}},{key:"enableEventListeners",value:function(){return gt.call(this)}},{key:"disableEventListeners",value:function(){return vt.call(this)}}]),t}();It.Utils=("undefined"!=typeof window?window:global).PopperUtils,It.placements=Et,It.Defaults=At;var Ot="dropdown",xt=i.default.fn[Ot],jt=new RegExp("38|40|27"),Lt={offset:0,flip:!0,boundary:"scrollParent",reference:"toggle",display:"dynamic",popperConfig:null},Pt={offset:"(number|string|function)",flip:"boolean",boundary:"(string|element)",reference:"(string|element)",display:"string",popperConfig:"(null|object)"},Ft=function(){function t(t,e){this._element=t,this._popper=null,this._config=this._getConfig(e),this._menu=this._getMenuElement(),this._inNavbar=this._detectNavbar(),this._addEventListeners()}var e=t.prototype;return e.toggle=function(){if(!this._element.disabled&&!i.default(this._element).hasClass("disabled")){var e=i.default(this._menu).hasClass("show");t._clearMenus(),e||this.show(!0)}},e.show=function(e){if(void 0===e&&(e=!1),!(this._element.disabled||i.default(this._element).hasClass("disabled")||i.default(this._menu).hasClass("show"))){var n={relatedTarget:this._element},o=i.default.Event("show.bs.dropdown",n),r=t._getParentFromElement(this._element);if(i.default(r).trigger(o),!o.isDefaultPrevented()){if(!this._inNavbar&&e){if("undefined"==typeof It)throw new TypeError("Bootstrap's dropdowns require Popper (https://popper.js.org)");var a=this._element;"parent"===this._config.reference?a=r:l.isElement(this._config.reference)&&(a=this._config.reference,"undefined"!=typeof 
this._config.reference.jquery&&(a=this._config.reference[0])),"scrollParent"!==this._config.boundary&&i.default(r).addClass("position-static"),this._popper=new It(a,this._menu,this._getPopperConfig())}"ontouchstart"in document.documentElement&&0===i.default(r).closest(".navbar-nav").length&&i.default(document.body).children().on("mouseover",null,i.default.noop),this._element.focus(),this._element.setAttribute("aria-expanded",!0),i.default(this._menu).toggleClass("show"),i.default(r).toggleClass("show").trigger(i.default.Event("shown.bs.dropdown",n))}}},e.hide=function(){if(!this._element.disabled&&!i.default(this._element).hasClass("disabled")&&i.default(this._menu).hasClass("show")){var e={relatedTarget:this._element},n=i.default.Event("hide.bs.dropdown",e),o=t._getParentFromElement(this._element);i.default(o).trigger(n),n.isDefaultPrevented()||(this._popper&&this._popper.destroy(),i.default(this._menu).toggleClass("show"),i.default(o).toggleClass("show").trigger(i.default.Event("hidden.bs.dropdown",e)))}},e.dispose=function(){i.default.removeData(this._element,"bs.dropdown"),i.default(this._element).off(".bs.dropdown"),this._element=null,this._menu=null,null!==this._popper&&(this._popper.destroy(),this._popper=null)},e.update=function(){this._inNavbar=this._detectNavbar(),null!==this._popper&&this._popper.scheduleUpdate()},e._addEventListeners=function(){var t=this;i.default(this._element).on("click.bs.dropdown",(function(e){e.preventDefault(),e.stopPropagation(),t.toggle()}))},e._getConfig=function(t){return t=a({},this.constructor.Default,i.default(this._element).data(),t),l.typeCheckConfig(Ot,t,this.constructor.DefaultType),t},e._getMenuElement=function(){if(!this._menu){var e=t._getParentFromElement(this._element);e&&(this._menu=e.querySelector(".dropdown-menu"))}return this._menu},e._getPlacement=function(){var t=i.default(this._element.parentNode),e="bottom-start";return 
t.hasClass("dropup")?e=i.default(this._menu).hasClass("dropdown-menu-right")?"top-end":"top-start":t.hasClass("dropright")?e="right-start":t.hasClass("dropleft")?e="left-start":i.default(this._menu).hasClass("dropdown-menu-right")&&(e="bottom-end"),e},e._detectNavbar=function(){return i.default(this._element).closest(".navbar").length>0},e._getOffset=function(){var t=this,e={};return"function"==typeof this._config.offset?e.fn=function(e){return e.offsets=a({},e.offsets,t._config.offset(e.offsets,t._element)||{}),e}:e.offset=this._config.offset,e},e._getPopperConfig=function(){var t={placement:this._getPlacement(),modifiers:{offset:this._getOffset(),flip:{enabled:this._config.flip},preventOverflow:{boundariesElement:this._config.boundary}}};return"static"===this._config.display&&(t.modifiers.applyStyle={enabled:!1}),a({},t,this._config.popperConfig)},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this).data("bs.dropdown");if(n||(n=new t(this,"object"==typeof e?e:null),i.default(this).data("bs.dropdown",n)),"string"==typeof e){if("undefined"==typeof n[e])throw new TypeError('No method named "'+e+'"');n[e]()}}))},t._clearMenus=function(e){if(!e||3!==e.which&&("keyup"!==e.type||9===e.which))for(var n=[].slice.call(document.querySelectorAll('[data-toggle="dropdown"]')),o=0,r=n.length;o0&&a--,40===e.which&&adocument.documentElement.clientHeight;n||(this._element.style.overflowY="hidden"),this._element.classList.add("modal-static");var o=l.getTransitionDurationFromElement(this._dialog);i.default(this._element).off(l.TRANSITION_END),i.default(this._element).one(l.TRANSITION_END,(function(){t._element.classList.remove("modal-static"),n||i.default(t._element).one(l.TRANSITION_END,(function(){t._element.style.overflowY=""})).emulateTransitionEnd(t._element,o)})).emulateTransitionEnd(o),this._element.focus()}},e._showElement=function(t){var 
e=this,n=i.default(this._element).hasClass("fade"),o=this._dialog?this._dialog.querySelector(".modal-body"):null;this._element.parentNode&&this._element.parentNode.nodeType===Node.ELEMENT_NODE||document.body.appendChild(this._element),this._element.style.display="block",this._element.removeAttribute("aria-hidden"),this._element.setAttribute("aria-modal",!0),this._element.setAttribute("role","dialog"),i.default(this._dialog).hasClass("modal-dialog-scrollable")&&o?o.scrollTop=0:this._element.scrollTop=0,n&&l.reflow(this._element),i.default(this._element).addClass("show"),this._config.focus&&this._enforceFocus();var r=i.default.Event("shown.bs.modal",{relatedTarget:t}),a=function(){e._config.focus&&e._element.focus(),e._isTransitioning=!1,i.default(e._element).trigger(r)};if(n){var s=l.getTransitionDurationFromElement(this._dialog);i.default(this._dialog).one(l.TRANSITION_END,a).emulateTransitionEnd(s)}else a()},e._enforceFocus=function(){var t=this;i.default(document).off("focusin.bs.modal").on("focusin.bs.modal",(function(e){document!==e.target&&t._element!==e.target&&0===i.default(t._element).has(e.target).length&&t._element.focus()}))},e._setEscapeEvent=function(){var t=this;this._isShown?i.default(this._element).on("keydown.dismiss.bs.modal",(function(e){t._config.keyboard&&27===e.which?(e.preventDefault(),t.hide()):t._config.keyboard||27!==e.which||t._triggerBackdropTransition()})):this._isShown||i.default(this._element).off("keydown.dismiss.bs.modal")},e._setResizeEvent=function(){var t=this;this._isShown?i.default(window).on("resize.bs.modal",(function(e){return t.handleUpdate(e)})):i.default(window).off("resize.bs.modal")},e._hideModal=function(){var 
t=this;this._element.style.display="none",this._element.setAttribute("aria-hidden",!0),this._element.removeAttribute("aria-modal"),this._element.removeAttribute("role"),this._isTransitioning=!1,this._showBackdrop((function(){i.default(document.body).removeClass("modal-open"),t._resetAdjustments(),t._resetScrollbar(),i.default(t._element).trigger("hidden.bs.modal")}))},e._removeBackdrop=function(){this._backdrop&&(i.default(this._backdrop).remove(),this._backdrop=null)},e._showBackdrop=function(t){var e=this,n=i.default(this._element).hasClass("fade")?"fade":"";if(this._isShown&&this._config.backdrop){if(this._backdrop=document.createElement("div"),this._backdrop.className="modal-backdrop",n&&this._backdrop.classList.add(n),i.default(this._backdrop).appendTo(document.body),i.default(this._element).on("click.dismiss.bs.modal",(function(t){e._ignoreBackdropClick?e._ignoreBackdropClick=!1:t.target===t.currentTarget&&("static"===e._config.backdrop?e._triggerBackdropTransition():e.hide())})),n&&l.reflow(this._backdrop),i.default(this._backdrop).addClass("show"),!t)return;if(!n)return void t();var o=l.getTransitionDurationFromElement(this._backdrop);i.default(this._backdrop).one(l.TRANSITION_END,t).emulateTransitionEnd(o)}else if(!this._isShown&&this._backdrop){i.default(this._backdrop).removeClass("show");var r=function(){e._removeBackdrop(),t&&t()};if(i.default(this._element).hasClass("fade")){var a=l.getTransitionDurationFromElement(this._backdrop);i.default(this._backdrop).one(l.TRANSITION_END,r).emulateTransitionEnd(a)}else r()}else t&&t()},e._adjustDialog=function(){var t=this._element.scrollHeight>document.documentElement.clientHeight;!this._isBodyOverflowing&&t&&(this._element.style.paddingLeft=this._scrollbarWidth+"px"),this._isBodyOverflowing&&!t&&(this._element.style.paddingRight=this._scrollbarWidth+"px")},e._resetAdjustments=function(){this._element.style.paddingLeft="",this._element.style.paddingRight=""},e._checkScrollbar=function(){var 
t=document.body.getBoundingClientRect();this._isBodyOverflowing=Math.round(t.left+t.right)
',trigger:"hover focus",title:"",delay:0,html:!1,selector:!1,placement:"top",offset:0,container:!1,fallbackPlacement:"flip",boundary:"scrollParent",customClass:"",sanitize:!0,sanitizeFn:null,whiteList:Qt,popperConfig:null},Zt={HIDE:"hide.bs.tooltip",HIDDEN:"hidden.bs.tooltip",SHOW:"show.bs.tooltip",SHOWN:"shown.bs.tooltip",INSERTED:"inserted.bs.tooltip",CLICK:"click.bs.tooltip",FOCUSIN:"focusin.bs.tooltip",FOCUSOUT:"focusout.bs.tooltip",MOUSEENTER:"mouseenter.bs.tooltip",MOUSELEAVE:"mouseleave.bs.tooltip"},te=function(){function t(t,e){if("undefined"==typeof It)throw new TypeError("Bootstrap's tooltips require Popper (https://popper.js.org)");this._isEnabled=!0,this._timeout=0,this._hoverState="",this._activeTrigger={},this._popper=null,this.element=t,this.config=this._getConfig(e),this.tip=null,this._setListeners()}var e=t.prototype;return e.enable=function(){this._isEnabled=!0},e.disable=function(){this._isEnabled=!1},e.toggleEnabled=function(){this._isEnabled=!this._isEnabled},e.toggle=function(t){if(this._isEnabled)if(t){var e=this.constructor.DATA_KEY,n=i.default(t.currentTarget).data(e);n||(n=new this.constructor(t.currentTarget,this._getDelegateConfig()),i.default(t.currentTarget).data(e,n)),n._activeTrigger.click=!n._activeTrigger.click,n._isWithActiveTrigger()?n._enter(null,n):n._leave(null,n)}else{if(i.default(this.getTipElement()).hasClass("show"))return void this._leave(null,this);this._enter(null,this)}},e.dispose=function(){clearTimeout(this._timeout),i.default.removeData(this.element,this.constructor.DATA_KEY),i.default(this.element).off(this.constructor.EVENT_KEY),i.default(this.element).closest(".modal").off("hide.bs.modal",this._hideModalHandler),this.tip&&i.default(this.tip).remove(),this._isEnabled=null,this._timeout=null,this._hoverState=null,this._activeTrigger=null,this._popper&&this._popper.destroy(),this._popper=null,this.element=null,this.config=null,this.tip=null},e.show=function(){var 
t=this;if("none"===i.default(this.element).css("display"))throw new Error("Please use show on visible elements");var e=i.default.Event(this.constructor.Event.SHOW);if(this.isWithContent()&&this._isEnabled){i.default(this.element).trigger(e);var n=l.findShadowRoot(this.element),o=i.default.contains(null!==n?n:this.element.ownerDocument.documentElement,this.element);if(e.isDefaultPrevented()||!o)return;var r=this.getTipElement(),a=l.getUID(this.constructor.NAME);r.setAttribute("id",a),this.element.setAttribute("aria-describedby",a),this.setContent(),this.config.animation&&i.default(r).addClass("fade");var s="function"==typeof this.config.placement?this.config.placement.call(this,r,this.element):this.config.placement,u=this._getAttachment(s);this.addAttachmentClass(u);var f=this._getContainer();i.default(r).data(this.constructor.DATA_KEY,this),i.default.contains(this.element.ownerDocument.documentElement,this.tip)||i.default(r).appendTo(f),i.default(this.element).trigger(this.constructor.Event.INSERTED),this._popper=new It(this.element,r,this._getPopperConfig(u)),i.default(r).addClass("show"),i.default(r).addClass(this.config.customClass),"ontouchstart"in document.documentElement&&i.default(document.body).children().on("mouseover",null,i.default.noop);var d=function(){t.config.animation&&t._fixTransition();var e=t._hoverState;t._hoverState=null,i.default(t.element).trigger(t.constructor.Event.SHOWN),"out"===e&&t._leave(null,t)};if(i.default(this.tip).hasClass("fade")){var c=l.getTransitionDurationFromElement(this.tip);i.default(this.tip).one(l.TRANSITION_END,d).emulateTransitionEnd(c)}else d()}},e.hide=function(t){var 
e=this,n=this.getTipElement(),o=i.default.Event(this.constructor.Event.HIDE),r=function(){"show"!==e._hoverState&&n.parentNode&&n.parentNode.removeChild(n),e._cleanTipClass(),e.element.removeAttribute("aria-describedby"),i.default(e.element).trigger(e.constructor.Event.HIDDEN),null!==e._popper&&e._popper.destroy(),t&&t()};if(i.default(this.element).trigger(o),!o.isDefaultPrevented()){if(i.default(n).removeClass("show"),"ontouchstart"in document.documentElement&&i.default(document.body).children().off("mouseover",null,i.default.noop),this._activeTrigger.click=!1,this._activeTrigger.focus=!1,this._activeTrigger.hover=!1,i.default(this.tip).hasClass("fade")){var a=l.getTransitionDurationFromElement(n);i.default(n).one(l.TRANSITION_END,r).emulateTransitionEnd(a)}else r();this._hoverState=""}},e.update=function(){null!==this._popper&&this._popper.scheduleUpdate()},e.isWithContent=function(){return Boolean(this.getTitle())},e.addAttachmentClass=function(t){i.default(this.getTipElement()).addClass("bs-tooltip-"+t)},e.getTipElement=function(){return this.tip=this.tip||i.default(this.config.template)[0],this.tip},e.setContent=function(){var t=this.getTipElement();this.setElementContent(i.default(t.querySelectorAll(".tooltip-inner")),this.getTitle()),i.default(t).removeClass("fade show")},e.setElementContent=function(t,e){"object"!=typeof e||!e.nodeType&&!e.jquery?this.config.html?(this.config.sanitize&&(e=Vt(e,this.config.whiteList,this.config.sanitizeFn)),t.html(e)):t.text(e):this.config.html?i.default(e).parent().is(t)||t.empty().append(e):t.text(i.default(e).text())},e.getTitle=function(){var t=this.element.getAttribute("data-original-title");return t||(t="function"==typeof this.config.title?this.config.title.call(this.element):this.config.title),t},e._getPopperConfig=function(t){var e=this;return 
a({},{placement:t,modifiers:{offset:this._getOffset(),flip:{behavior:this.config.fallbackPlacement},arrow:{element:".arrow"},preventOverflow:{boundariesElement:this.config.boundary}},onCreate:function(t){t.originalPlacement!==t.placement&&e._handlePopperPlacementChange(t)},onUpdate:function(t){return e._handlePopperPlacementChange(t)}},this.config.popperConfig)},e._getOffset=function(){var t=this,e={};return"function"==typeof this.config.offset?e.fn=function(e){return e.offsets=a({},e.offsets,t.config.offset(e.offsets,t.element)||{}),e}:e.offset=this.config.offset,e},e._getContainer=function(){return!1===this.config.container?document.body:l.isElement(this.config.container)?i.default(this.config.container):i.default(document).find(this.config.container)},e._getAttachment=function(t){return $t[t.toUpperCase()]},e._setListeners=function(){var t=this;this.config.trigger.split(" ").forEach((function(e){if("click"===e)i.default(t.element).on(t.constructor.Event.CLICK,t.config.selector,(function(e){return t.toggle(e)}));else if("manual"!==e){var n="hover"===e?t.constructor.Event.MOUSEENTER:t.constructor.Event.FOCUSIN,o="hover"===e?t.constructor.Event.MOUSELEAVE:t.constructor.Event.FOCUSOUT;i.default(t.element).on(n,t.config.selector,(function(e){return t._enter(e)})).on(o,t.config.selector,(function(e){return t._leave(e)}))}})),this._hideModalHandler=function(){t.element&&t.hide()},i.default(this.element).closest(".modal").on("hide.bs.modal",this._hideModalHandler),this.config.selector?this.config=a({},this.config,{trigger:"manual",selector:""}):this._fixTitle()},e._fixTitle=function(){var t=typeof this.element.getAttribute("data-original-title");(this.element.getAttribute("title")||"string"!==t)&&(this.element.setAttribute("data-original-title",this.element.getAttribute("title")||""),this.element.setAttribute("title",""))},e._enter=function(t,e){var n=this.constructor.DATA_KEY;(e=e||i.default(t.currentTarget).data(n))||(e=new 
this.constructor(t.currentTarget,this._getDelegateConfig()),i.default(t.currentTarget).data(n,e)),t&&(e._activeTrigger["focusin"===t.type?"focus":"hover"]=!0),i.default(e.getTipElement()).hasClass("show")||"show"===e._hoverState?e._hoverState="show":(clearTimeout(e._timeout),e._hoverState="show",e.config.delay&&e.config.delay.show?e._timeout=setTimeout((function(){"show"===e._hoverState&&e.show()}),e.config.delay.show):e.show())},e._leave=function(t,e){var n=this.constructor.DATA_KEY;(e=e||i.default(t.currentTarget).data(n))||(e=new this.constructor(t.currentTarget,this._getDelegateConfig()),i.default(t.currentTarget).data(n,e)),t&&(e._activeTrigger["focusout"===t.type?"focus":"hover"]=!1),e._isWithActiveTrigger()||(clearTimeout(e._timeout),e._hoverState="out",e.config.delay&&e.config.delay.hide?e._timeout=setTimeout((function(){"out"===e._hoverState&&e.hide()}),e.config.delay.hide):e.hide())},e._isWithActiveTrigger=function(){for(var t in this._activeTrigger)if(this._activeTrigger[t])return!0;return!1},e._getConfig=function(t){var e=i.default(this.element).data();return Object.keys(e).forEach((function(t){-1!==Kt.indexOf(t)&&delete e[t]})),"number"==typeof(t=a({},this.constructor.Default,e,"object"==typeof t&&t?t:{})).delay&&(t.delay={show:t.delay,hide:t.delay}),"number"==typeof t.title&&(t.title=t.title.toString()),"number"==typeof t.content&&(t.content=t.content.toString()),l.typeCheckConfig(Yt,t,this.constructor.DefaultType),t.sanitize&&(t.template=Vt(t.template,t.whiteList,t.sanitizeFn)),t},e._getDelegateConfig=function(){var t={};if(this.config)for(var e in this.config)this.constructor.Default[e]!==this.config[e]&&(t[e]=this.config[e]);return t},e._cleanTipClass=function(){var 
t=i.default(this.getTipElement()),e=t.attr("class").match(Xt);null!==e&&e.length&&t.removeClass(e.join(""))},e._handlePopperPlacementChange=function(t){this.tip=t.instance.popper,this._cleanTipClass(),this.addAttachmentClass(this._getAttachment(t.placement))},e._fixTransition=function(){var t=this.getTipElement(),e=this.config.animation;null===t.getAttribute("x-placement")&&(i.default(t).removeClass("fade"),this.config.animation=!1,this.hide(),this.show(),this.config.animation=e)},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this),o=n.data("bs.tooltip"),r="object"==typeof e&&e;if((o||!/dispose|hide/.test(e))&&(o||(o=new t(this,r),n.data("bs.tooltip",o)),"string"==typeof e)){if("undefined"==typeof o[e])throw new TypeError('No method named "'+e+'"');o[e]()}}))},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}},{key:"Default",get:function(){return Jt}},{key:"NAME",get:function(){return Yt}},{key:"DATA_KEY",get:function(){return"bs.tooltip"}},{key:"Event",get:function(){return Zt}},{key:"EVENT_KEY",get:function(){return".bs.tooltip"}},{key:"DefaultType",get:function(){return Gt}}]),t}();i.default.fn[Yt]=te._jQueryInterface,i.default.fn[Yt].Constructor=te,i.default.fn[Yt].noConflict=function(){return i.default.fn[Yt]=zt,te._jQueryInterface};var ee="popover",ne=i.default.fn[ee],ie=new RegExp("(^|\\s)bs-popover\\S+","g"),oe=a({},te.Default,{placement:"right",trigger:"click",content:"",template:''}),re=a({},te.DefaultType,{content:"(string|element|function)"}),ae={HIDE:"hide.bs.popover",HIDDEN:"hidden.bs.popover",SHOW:"show.bs.popover",SHOWN:"shown.bs.popover",INSERTED:"inserted.bs.popover",CLICK:"click.bs.popover",FOCUSIN:"focusin.bs.popover",FOCUSOUT:"focusout.bs.popover",MOUSEENTER:"mouseenter.bs.popover",MOUSELEAVE:"mouseleave.bs.popover"},se=function(t){var e,n;function o(){return t.apply(this,arguments)||this}n=t,(e=o).prototype=Object.create(n.prototype),e.prototype.constructor=e,e.__proto__=n;var a=o.prototype;return 
a.isWithContent=function(){return this.getTitle()||this._getContent()},a.addAttachmentClass=function(t){i.default(this.getTipElement()).addClass("bs-popover-"+t)},a.getTipElement=function(){return this.tip=this.tip||i.default(this.config.template)[0],this.tip},a.setContent=function(){var t=i.default(this.getTipElement());this.setElementContent(t.find(".popover-header"),this.getTitle());var e=this._getContent();"function"==typeof e&&(e=e.call(this.element)),this.setElementContent(t.find(".popover-body"),e),t.removeClass("fade show")},a._getContent=function(){return this.element.getAttribute("data-content")||this.config.content},a._cleanTipClass=function(){var t=i.default(this.getTipElement()),e=t.attr("class").match(ie);null!==e&&e.length>0&&t.removeClass(e.join(""))},o._jQueryInterface=function(t){return this.each((function(){var e=i.default(this).data("bs.popover"),n="object"==typeof t?t:null;if((e||!/dispose|hide/.test(t))&&(e||(e=new o(this,n),i.default(this).data("bs.popover",e)),"string"==typeof t)){if("undefined"==typeof e[t])throw new TypeError('No method named "'+t+'"');e[t]()}}))},r(o,null,[{key:"VERSION",get:function(){return"4.6.0"}},{key:"Default",get:function(){return oe}},{key:"NAME",get:function(){return ee}},{key:"DATA_KEY",get:function(){return"bs.popover"}},{key:"Event",get:function(){return ae}},{key:"EVENT_KEY",get:function(){return".bs.popover"}},{key:"DefaultType",get:function(){return re}}]),o}(te);i.default.fn[ee]=se._jQueryInterface,i.default.fn[ee].Constructor=se,i.default.fn[ee].noConflict=function(){return i.default.fn[ee]=ne,se._jQueryInterface};var le="scrollspy",ue=i.default.fn[le],fe={offset:10,method:"auto",target:""},de={offset:"number",method:"string",target:"(string|element)"},ce=function(){function t(t,e){var n=this;this._element=t,this._scrollElement="BODY"===t.tagName?window:t,this._config=this._getConfig(e),this._selector=this._config.target+" .nav-link,"+this._config.target+" .list-group-item,"+this._config.target+" 
.dropdown-item",this._offsets=[],this._targets=[],this._activeTarget=null,this._scrollHeight=0,i.default(this._scrollElement).on("scroll.bs.scrollspy",(function(t){return n._process(t)})),this.refresh(),this._process()}var e=t.prototype;return e.refresh=function(){var t=this,e=this._scrollElement===this._scrollElement.window?"offset":"position",n="auto"===this._config.method?e:this._config.method,o="position"===n?this._getScrollTop():0;this._offsets=[],this._targets=[],this._scrollHeight=this._getScrollHeight(),[].slice.call(document.querySelectorAll(this._selector)).map((function(t){var e,r=l.getSelectorFromElement(t);if(r&&(e=document.querySelector(r)),e){var a=e.getBoundingClientRect();if(a.width||a.height)return[i.default(e)[n]().top+o,r]}return null})).filter((function(t){return t})).sort((function(t,e){return t[0]-e[0]})).forEach((function(e){t._offsets.push(e[0]),t._targets.push(e[1])}))},e.dispose=function(){i.default.removeData(this._element,"bs.scrollspy"),i.default(this._scrollElement).off(".bs.scrollspy"),this._element=null,this._scrollElement=null,this._config=null,this._selector=null,this._offsets=null,this._targets=null,this._activeTarget=null,this._scrollHeight=null},e._getConfig=function(t){if("string"!=typeof(t=a({},fe,"object"==typeof t&&t?t:{})).target&&l.isElement(t.target)){var e=i.default(t.target).attr("id");e||(e=l.getUID(le),i.default(t.target).attr("id",e)),t.target="#"+e}return l.typeCheckConfig(le,t,de),t},e._getScrollTop=function(){return this._scrollElement===window?this._scrollElement.pageYOffset:this._scrollElement.scrollTop},e._getScrollHeight=function(){return this._scrollElement.scrollHeight||Math.max(document.body.scrollHeight,document.documentElement.scrollHeight)},e._getOffsetHeight=function(){return this._scrollElement===window?window.innerHeight:this._scrollElement.getBoundingClientRect().height},e._process=function(){var 
t=this._getScrollTop()+this._config.offset,e=this._getScrollHeight(),n=this._config.offset+e-this._getOffsetHeight();if(this._scrollHeight!==e&&this.refresh(),t>=n){var i=this._targets[this._targets.length-1];this._activeTarget!==i&&this._activate(i)}else{if(this._activeTarget&&t0)return this._activeTarget=null,void this._clear();for(var o=this._offsets.length;o--;){this._activeTarget!==this._targets[o]&&t>=this._offsets[o]&&("undefined"==typeof this._offsets[o+1]||t li > .active":".active";n=(n=i.default.makeArray(i.default(o).find(a)))[n.length-1]}var s=i.default.Event("hide.bs.tab",{relatedTarget:this._element}),u=i.default.Event("show.bs.tab",{relatedTarget:n});if(n&&i.default(n).trigger(s),i.default(this._element).trigger(u),!u.isDefaultPrevented()&&!s.isDefaultPrevented()){r&&(e=document.querySelector(r)),this._activate(this._element,o);var f=function(){var e=i.default.Event("hidden.bs.tab",{relatedTarget:t._element}),o=i.default.Event("shown.bs.tab",{relatedTarget:n});i.default(n).trigger(e),i.default(t._element).trigger(o)};e?this._activate(e,e.parentNode,f):f()}}},e.dispose=function(){i.default.removeData(this._element,"bs.tab"),this._element=null},e._activate=function(t,e,n){var o=this,r=(!e||"UL"!==e.nodeName&&"OL"!==e.nodeName?i.default(e).children(".active"):i.default(e).find("> li > .active"))[0],a=n&&r&&i.default(r).hasClass("fade"),s=function(){return o._transitionComplete(t,r,n)};if(r&&a){var u=l.getTransitionDurationFromElement(r);i.default(r).removeClass("show").one(l.TRANSITION_END,s).emulateTransitionEnd(u)}else s()},e._transitionComplete=function(t,e,n){if(e){i.default(e).removeClass("active");var o=i.default(e.parentNode).find("> .dropdown-menu 
.active")[0];o&&i.default(o).removeClass("active"),"tab"===e.getAttribute("role")&&e.setAttribute("aria-selected",!1)}if(i.default(t).addClass("active"),"tab"===t.getAttribute("role")&&t.setAttribute("aria-selected",!0),l.reflow(t),t.classList.contains("fade")&&t.classList.add("show"),t.parentNode&&i.default(t.parentNode).hasClass("dropdown-menu")){var r=i.default(t).closest(".dropdown")[0];if(r){var a=[].slice.call(r.querySelectorAll(".dropdown-toggle"));i.default(a).addClass("active")}t.setAttribute("aria-expanded",!0)}n&&n()},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this),o=n.data("bs.tab");if(o||(o=new t(this),n.data("bs.tab",o)),"string"==typeof e){if("undefined"==typeof o[e])throw new TypeError('No method named "'+e+'"');o[e]()}}))},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}}]),t}();i.default(document).on("click.bs.tab.data-api",'[data-toggle="tab"], [data-toggle="pill"], [data-toggle="list"]',(function(t){t.preventDefault(),pe._jQueryInterface.call(i.default(this),"show")})),i.default.fn.tab=pe._jQueryInterface,i.default.fn.tab.Constructor=pe,i.default.fn.tab.noConflict=function(){return i.default.fn.tab=he,pe._jQueryInterface};var me=i.default.fn.toast,ge={animation:"boolean",autohide:"boolean",delay:"number"},ve={animation:!0,autohide:!0,delay:500},_e=function(){function t(t,e){this._element=t,this._config=this._getConfig(e),this._timeout=null,this._setListeners()}var e=t.prototype;return e.show=function(){var t=this,e=i.default.Event("show.bs.toast");if(i.default(this._element).trigger(e),!e.isDefaultPrevented()){this._clearTimeout(),this._config.animation&&this._element.classList.add("fade");var 
n=function(){t._element.classList.remove("showing"),t._element.classList.add("show"),i.default(t._element).trigger("shown.bs.toast"),t._config.autohide&&(t._timeout=setTimeout((function(){t.hide()}),t._config.delay))};if(this._element.classList.remove("hide"),l.reflow(this._element),this._element.classList.add("showing"),this._config.animation){var o=l.getTransitionDurationFromElement(this._element);i.default(this._element).one(l.TRANSITION_END,n).emulateTransitionEnd(o)}else n()}},e.hide=function(){if(this._element.classList.contains("show")){var t=i.default.Event("hide.bs.toast");i.default(this._element).trigger(t),t.isDefaultPrevented()||this._close()}},e.dispose=function(){this._clearTimeout(),this._element.classList.contains("show")&&this._element.classList.remove("show"),i.default(this._element).off("click.dismiss.bs.toast"),i.default.removeData(this._element,"bs.toast"),this._element=null,this._config=null},e._getConfig=function(t){return t=a({},ve,i.default(this._element).data(),"object"==typeof t&&t?t:{}),l.typeCheckConfig("toast",t,this.constructor.DefaultType),t},e._setListeners=function(){var t=this;i.default(this._element).on("click.dismiss.bs.toast",'[data-dismiss="toast"]',(function(){return t.hide()}))},e._close=function(){var t=this,e=function(){t._element.classList.add("hide"),i.default(t._element).trigger("hidden.bs.toast")};if(this._element.classList.remove("show"),this._config.animation){var n=l.getTransitionDurationFromElement(this._element);i.default(this._element).one(l.TRANSITION_END,e).emulateTransitionEnd(n)}else e()},e._clearTimeout=function(){clearTimeout(this._timeout),this._timeout=null},t._jQueryInterface=function(e){return this.each((function(){var n=i.default(this),o=n.data("bs.toast");if(o||(o=new t(this,"object"==typeof e&&e),n.data("bs.toast",o)),"string"==typeof e){if("undefined"==typeof o[e])throw new TypeError('No method named 
"'+e+'"');o[e](this)}}))},r(t,null,[{key:"VERSION",get:function(){return"4.6.0"}},{key:"DefaultType",get:function(){return ge}},{key:"Default",get:function(){return ve}}]),t}();i.default.fn.toast=_e._jQueryInterface,i.default.fn.toast.Constructor=_e,i.default.fn.toast.noConflict=function(){return i.default.fn.toast=me,_e._jQueryInterface},t.Alert=d,t.Button=h,t.Carousel=y,t.Collapse=S,t.Dropdown=Ft,t.Modal=qt,t.Popover=se,t.Scrollspy=ce,t.Tab=pe,t.Toast=_e,t.Tooltip=te,t.Util=l,Object.defineProperty(t,"__esModule",{value:!0})})); diff --git a/js/_vendor/medium-zoom.esm.js b/js/_vendor/medium-zoom.esm.js new file mode 100644 index 00000000..fa2b3776 --- /dev/null +++ b/js/_vendor/medium-zoom.esm.js @@ -0,0 +1,622 @@ +/*! medium-zoom 1.0.6 | MIT License | https://github.com/francoischalifour/medium-zoom */ +var _extends = Object.assign || function (target) { + for (var i = 1; i < arguments.length; i++) { + var source = arguments[i]; + + for (var key in source) { + if (Object.prototype.hasOwnProperty.call(source, key)) { + target[key] = source[key]; + } + } + } + + return target; +}; + +var isSupported = function isSupported(node) { + return node.tagName === 'IMG'; +}; + +/* eslint-disable-next-line no-prototype-builtins */ +var isNodeList = function isNodeList(selector) { + return NodeList.prototype.isPrototypeOf(selector); +}; + +var isNode = function isNode(selector) { + return selector && selector.nodeType === 1; +}; + +var isSvg = function isSvg(image) { + var source = image.currentSrc || image.src; + return source.substr(-4).toLowerCase() === '.svg'; +}; + +var getImagesFromSelector = function getImagesFromSelector(selector) { + try { + if (Array.isArray(selector)) { + return selector.filter(isSupported); + } + + if (isNodeList(selector)) { + // Do not use spread operator or Array.from() for IE support + return [].slice.call(selector).filter(isSupported); + } + + if (isNode(selector)) { + return [selector].filter(isSupported); + } + + if (typeof selector === 
'string') { + // Do not use spread operator or Array.from() for IE support + return [].slice.call(document.querySelectorAll(selector)).filter(isSupported); + } + + return []; + } catch (err) { + throw new TypeError('The provided selector is invalid.\n' + 'Expects a CSS selector, a Node element, a NodeList or an array.\n' + 'See: https://github.com/francoischalifour/medium-zoom'); + } +}; + +var createOverlay = function createOverlay(background) { + var overlay = document.createElement('div'); + overlay.classList.add('medium-zoom-overlay'); + overlay.style.background = background; + + return overlay; +}; + +var cloneTarget = function cloneTarget(template) { + var _template$getBounding = template.getBoundingClientRect(), + top = _template$getBounding.top, + left = _template$getBounding.left, + width = _template$getBounding.width, + height = _template$getBounding.height; + + var clone = template.cloneNode(); + var scrollTop = window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; + var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft || document.body.scrollLeft || 0; + + clone.removeAttribute('id'); + clone.style.position = 'absolute'; + clone.style.top = top + scrollTop + 'px'; + clone.style.left = left + scrollLeft + 'px'; + clone.style.width = width + 'px'; + clone.style.height = height + 'px'; + clone.style.transform = ''; + + return clone; +}; + +var createCustomEvent = function createCustomEvent(type, params) { + var eventParams = _extends({ + bubbles: false, + cancelable: false, + detail: undefined + }, params); + + if (typeof window.CustomEvent === 'function') { + return new CustomEvent(type, eventParams); + } + + var customEvent = document.createEvent('CustomEvent'); + customEvent.initCustomEvent(type, eventParams.bubbles, eventParams.cancelable, eventParams.detail); + + return customEvent; +}; + +var mediumZoomEsm = function mediumZoom(selector) { + var options = arguments.length > 1 && arguments[1] 
!== undefined ? arguments[1] : {}; + + /** + * Ensure the compatibility with IE11 if no Promise polyfill are used. + */ + var Promise = window.Promise || function Promise(fn) { + function noop() {} + fn(noop, noop); + }; + + var _handleClick = function _handleClick(event) { + var target = event.target; + + + if (target === overlay) { + close(); + return; + } + + if (images.indexOf(target) === -1) { + return; + } + + toggle({ target: target }); + }; + + var _handleScroll = function _handleScroll() { + if (isAnimating || !active.original) { + return; + } + + var currentScroll = window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; + + if (Math.abs(scrollTop - currentScroll) > zoomOptions.scrollOffset) { + setTimeout(close, 150); + } + }; + + var _handleKeyUp = function _handleKeyUp(event) { + var key = event.key || event.keyCode; + + // Close if escape key is pressed + if (key === 'Escape' || key === 'Esc' || key === 27) { + close(); + } + }; + + var update = function update() { + var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {}; + + var newOptions = options; + + if (options.background) { + overlay.style.background = options.background; + } + + if (options.container && options.container instanceof Object) { + newOptions.container = _extends({}, zoomOptions.container, options.container); + } + + if (options.template) { + var template = isNode(options.template) ? options.template : document.querySelector(options.template); + + newOptions.template = template; + } + + zoomOptions = _extends({}, zoomOptions, newOptions); + + images.forEach(function (image) { + image.dispatchEvent(createCustomEvent('medium-zoom:update', { + detail: { zoom: zoom } + })); + }); + + return zoom; + }; + + var clone = function clone() { + var options = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {}; + return mediumZoomEsm(_extends({}, zoomOptions, options)); + }; + + var attach = function attach() { + for (var _len = arguments.length, selectors = Array(_len), _key = 0; _key < _len; _key++) { + selectors[_key] = arguments[_key]; + } + + var newImages = selectors.reduce(function (imagesAccumulator, currentSelector) { + return [].concat(imagesAccumulator, getImagesFromSelector(currentSelector)); + }, []); + + newImages.filter(function (newImage) { + return images.indexOf(newImage) === -1; + }).forEach(function (newImage) { + images.push(newImage); + newImage.classList.add('medium-zoom-image'); + }); + + eventListeners.forEach(function (_ref) { + var type = _ref.type, + listener = _ref.listener, + options = _ref.options; + + newImages.forEach(function (image) { + image.addEventListener(type, listener, options); + }); + }); + + return zoom; + }; + + var detach = function detach() { + for (var _len2 = arguments.length, selectors = Array(_len2), _key2 = 0; _key2 < _len2; _key2++) { + selectors[_key2] = arguments[_key2]; + } + + if (active.zoomed) { + close(); + } + + var imagesToDetach = selectors.length > 0 ? selectors.reduce(function (imagesAccumulator, currentSelector) { + return [].concat(imagesAccumulator, getImagesFromSelector(currentSelector)); + }, []) : images; + + imagesToDetach.forEach(function (image) { + image.classList.remove('medium-zoom-image'); + image.dispatchEvent(createCustomEvent('medium-zoom:detach', { + detail: { zoom: zoom } + })); + }); + + images = images.filter(function (image) { + return imagesToDetach.indexOf(image) === -1; + }); + + return zoom; + }; + + var on = function on(type, listener) { + var options = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : {}; + + images.forEach(function (image) { + image.addEventListener('medium-zoom:' + type, listener, options); + }); + + eventListeners.push({ type: 'medium-zoom:' + type, listener: listener, options: options }); + + return zoom; + }; + + var off = function off(type, listener) { + var options = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {}; + + images.forEach(function (image) { + image.removeEventListener('medium-zoom:' + type, listener, options); + }); + + eventListeners = eventListeners.filter(function (eventListener) { + return !(eventListener.type === 'medium-zoom:' + type && eventListener.listener.toString() === listener.toString()); + }); + + return zoom; + }; + + var open = function open() { + var _ref2 = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {}, + target = _ref2.target; + + var _animate = function _animate() { + var container = { + width: document.documentElement.clientWidth, + height: document.documentElement.clientHeight, + left: 0, + top: 0, + right: 0, + bottom: 0 + }; + var viewportWidth = void 0; + var viewportHeight = void 0; + + if (zoomOptions.container) { + if (zoomOptions.container instanceof Object) { + // The container is given as an object with properties like width, height, left, top + container = _extends({}, container, zoomOptions.container); + + // We need to adjust custom options like container.right or container.bottom + viewportWidth = container.width - container.left - container.right - zoomOptions.margin * 2; + viewportHeight = container.height - container.top - container.bottom - zoomOptions.margin * 2; + } else { + // The container is given as an element + var zoomContainer = isNode(zoomOptions.container) ? 
zoomOptions.container : document.querySelector(zoomOptions.container); + + var _zoomContainer$getBou = zoomContainer.getBoundingClientRect(), + _width = _zoomContainer$getBou.width, + _height = _zoomContainer$getBou.height, + _left = _zoomContainer$getBou.left, + _top = _zoomContainer$getBou.top; + + container = _extends({}, container, { + width: _width, + height: _height, + left: _left, + top: _top + }); + } + } + + viewportWidth = viewportWidth || container.width - zoomOptions.margin * 2; + viewportHeight = viewportHeight || container.height - zoomOptions.margin * 2; + + var zoomTarget = active.zoomedHd || active.original; + var naturalWidth = isSvg(zoomTarget) ? viewportWidth : zoomTarget.naturalWidth || viewportWidth; + var naturalHeight = isSvg(zoomTarget) ? viewportHeight : zoomTarget.naturalHeight || viewportHeight; + + var _zoomTarget$getBoundi = zoomTarget.getBoundingClientRect(), + top = _zoomTarget$getBoundi.top, + left = _zoomTarget$getBoundi.left, + width = _zoomTarget$getBoundi.width, + height = _zoomTarget$getBoundi.height; + + var scaleX = Math.min(naturalWidth, viewportWidth) / width; + var scaleY = Math.min(naturalHeight, viewportHeight) / height; + var scale = Math.min(scaleX, scaleY); + var translateX = (-left + (viewportWidth - width) / 2 + zoomOptions.margin + container.left) / scale; + var translateY = (-top + (viewportHeight - height) / 2 + zoomOptions.margin + container.top) / scale; + var transform = 'scale(' + scale + ') translate3d(' + translateX + 'px, ' + translateY + 'px, 0)'; + + active.zoomed.style.transform = transform; + + if (active.zoomedHd) { + active.zoomedHd.style.transform = transform; + } + }; + + return new Promise(function (resolve) { + if (target && images.indexOf(target) === -1) { + resolve(zoom); + return; + } + + var _handleOpenEnd = function _handleOpenEnd() { + isAnimating = false; + active.zoomed.removeEventListener('transitionend', _handleOpenEnd); + 
active.original.dispatchEvent(createCustomEvent('medium-zoom:opened', { + detail: { zoom: zoom } + })); + + resolve(zoom); + }; + + if (active.zoomed) { + resolve(zoom); + return; + } + + if (target) { + // The zoom was triggered manually via a click + active.original = target; + } else if (images.length > 0) { +var _images = images; + active.original = _images[0]; + } else { + resolve(zoom); + return; + } + + active.original.dispatchEvent(createCustomEvent('medium-zoom:open', { + detail: { zoom: zoom } + })); + + scrollTop = window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; + isAnimating = true; + active.zoomed = cloneTarget(active.original); + + document.body.appendChild(overlay); + + if (zoomOptions.template) { + var template = isNode(zoomOptions.template) ? zoomOptions.template : document.querySelector(zoomOptions.template); + active.template = document.createElement('div'); + active.template.appendChild(template.content.cloneNode(true)); + + document.body.appendChild(active.template); + } + + document.body.appendChild(active.zoomed); + + window.requestAnimationFrame(function () { + document.body.classList.add('medium-zoom--opened'); + }); + + active.original.classList.add('medium-zoom-image--hidden'); + active.zoomed.classList.add('medium-zoom-image--opened'); + + active.zoomed.addEventListener('click', close); + active.zoomed.addEventListener('transitionend', _handleOpenEnd); + + if (active.original.getAttribute('data-zoom-src')) { + active.zoomedHd = active.zoomed.cloneNode(); + + // Reset the `scrset` property or the HD image won't load. 
+ active.zoomedHd.removeAttribute('srcset'); + active.zoomedHd.removeAttribute('sizes'); + + active.zoomedHd.src = active.zoomed.getAttribute('data-zoom-src'); + + active.zoomedHd.onerror = function () { + clearInterval(getZoomTargetSize); + console.warn('Unable to reach the zoom image target ' + active.zoomedHd.src); + active.zoomedHd = null; + _animate(); + }; + + // We need to access the natural size of the full HD + // target as fast as possible to compute the animation. + var getZoomTargetSize = setInterval(function () { + if ( active.zoomedHd.complete) { + clearInterval(getZoomTargetSize); + active.zoomedHd.classList.add('medium-zoom-image--opened'); + active.zoomedHd.addEventListener('click', close); + document.body.appendChild(active.zoomedHd); + _animate(); + } + }, 10); + } else if (active.original.hasAttribute('srcset')) { + // If an image has a `srcset` attribuet, we don't know the dimensions of the + // zoomed (HD) image (like when `data-zoom-src` is specified). + // Therefore the approach is quite similar. + active.zoomedHd = active.zoomed.cloneNode(); + + // Resetting the sizes attribute tells the browser to load the + // image best fitting the current viewport size, respecting the `srcset`. + active.zoomedHd.removeAttribute('sizes'); + + // In Firefox, the `loading` attribute needs to be set to `eager` (default + // value) for the load event to be fired. + active.zoomedHd.removeAttribute('loading'); + + // Wait for the load event of the hd image. This will fire if the image + // is already cached. 
+ var loadEventListener = active.zoomedHd.addEventListener('load', function () { + active.zoomedHd.removeEventListener('load', loadEventListener); + active.zoomedHd.classList.add('medium-zoom-image--opened'); + active.zoomedHd.addEventListener('click', close); + document.body.appendChild(active.zoomedHd); + _animate(); + }); + } else { + _animate(); + } + }); + }; + + var close = function close() { + return new Promise(function (resolve) { + if (isAnimating || !active.original) { + resolve(zoom); + return; + } + + var _handleCloseEnd = function _handleCloseEnd() { + active.original.classList.remove('medium-zoom-image--hidden'); + document.body.removeChild(active.zoomed); + if (active.zoomedHd) { + document.body.removeChild(active.zoomedHd); + } + document.body.removeChild(overlay); + active.zoomed.classList.remove('medium-zoom-image--opened'); + if (active.template) { + document.body.removeChild(active.template); + } + + isAnimating = false; + active.zoomed.removeEventListener('transitionend', _handleCloseEnd); + + active.original.dispatchEvent(createCustomEvent('medium-zoom:closed', { + detail: { zoom: zoom } + })); + + active.original = null; + active.zoomed = null; + active.zoomedHd = null; + active.template = null; + + resolve(zoom); + }; + + isAnimating = true; + document.body.classList.remove('medium-zoom--opened'); + active.zoomed.style.transform = ''; + + if (active.zoomedHd) { + active.zoomedHd.style.transform = ''; + } + + // Fade out the template so it's not too abrupt + if (active.template) { + active.template.style.transition = 'opacity 150ms'; + active.template.style.opacity = 0; + } + + active.original.dispatchEvent(createCustomEvent('medium-zoom:close', { + detail: { zoom: zoom } + })); + + active.zoomed.addEventListener('transitionend', _handleCloseEnd); + }); + }; + + var toggle = function toggle() { + var _ref3 = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : {}, + target = _ref3.target; + + if (active.original) { + return close(); + } + + return open({ target: target }); + }; + + var getOptions = function getOptions() { + return zoomOptions; + }; + + var getImages = function getImages() { + return images; + }; + + var getZoomedImage = function getZoomedImage() { + return active.original; + }; + + var images = []; + var eventListeners = []; + var isAnimating = false; + var scrollTop = 0; + var zoomOptions = options; + var active = { + original: null, + zoomed: null, + zoomedHd: null, + template: null + + // If the selector is omitted, it's replaced by the options + };if (Object.prototype.toString.call(selector) === '[object Object]') { + zoomOptions = selector; + } else if (selector || typeof selector === 'string' // to process empty string as a selector + ) { + attach(selector); + } + + // Apply the default option values + zoomOptions = _extends({ + margin: 0, + background: '#fff', + scrollOffset: 40, + container: null, + template: null + }, zoomOptions); + + var overlay = createOverlay(zoomOptions.background); + + document.addEventListener('click', _handleClick); + document.addEventListener('keyup', _handleKeyUp); + document.addEventListener('scroll', _handleScroll); + window.addEventListener('resize', close); + + var zoom = { + open: open, + close: close, + toggle: toggle, + update: update, + clone: clone, + attach: attach, + detach: detach, + on: on, + off: off, + getOptions: getOptions, + getImages: getImages, + getZoomedImage: getZoomedImage + }; + + return zoom; +}; + +function styleInject(css, ref) { + if ( ref === void 0 ) ref = {}; + var insertAt = ref.insertAt; + + if (!css || typeof document === 'undefined') { return; } + + var head = document.head || document.getElementsByTagName('head')[0]; + var style = document.createElement('style'); + style.type = 'text/css'; + + if (insertAt === 'top') { + if (head.firstChild) { + head.insertBefore(style, head.firstChild); + } else { + 
head.appendChild(style); + } + } else { + head.appendChild(style); + } + + if (style.styleSheet) { + style.styleSheet.cssText = css; + } else { + style.appendChild(document.createTextNode(css)); + } +} + +var css = ".medium-zoom-overlay{position:fixed;top:0;right:0;bottom:0;left:0;opacity:0;transition:opacity .3s;will-change:opacity}.medium-zoom--opened .medium-zoom-overlay{cursor:pointer;cursor:zoom-out;opacity:1}.medium-zoom-image{cursor:pointer;cursor:zoom-in;transition:transform .3s cubic-bezier(.2,0,.2,1)!important}.medium-zoom-image--hidden{visibility:hidden}.medium-zoom-image--opened{position:relative;cursor:pointer;cursor:zoom-out;will-change:transform}"; +styleInject(css); + +export default mediumZoomEsm; diff --git a/js/algolia-search.js b/js/algolia-search.js new file mode 100644 index 00000000..949d87bc --- /dev/null +++ b/js/algolia-search.js @@ -0,0 +1,74 @@ +/************************************************* + * Wowchemy + * https://github.com/wowchemy/wowchemy-hugo-modules + * + * Algolia based search algorithm. + **************************************************/ + +import {algoliaConfig, i18n, content_type} from '@params'; + +function getTemplate(templateName) { + return document.querySelector(`#${templateName}-template`).innerHTML; +} + +if (typeof instantsearch === 'function' && $('#search-box').length) { + const options = { + appId: algoliaConfig.appId, + apiKey: algoliaConfig.apiKey, + indexName: algoliaConfig.indexName, + routing: true, + searchParameters: { + hitsPerPage: 10, + }, + searchFunction: function (helper) { + let searchResults = document.querySelector('#search-hits'); + if (helper.state.query === '') { + searchResults.style.display = 'none'; + return; + } + helper.search(); + searchResults.style.display = 'block'; + }, + }; + + const search = instantsearch(options); + + // Initialize search box. 
+ search.addWidget( + instantsearch.widgets.searchBox({ + container: '#search-box', + autofocus: false, + reset: true, + poweredBy: algoliaConfig.poweredBy, + placeholder: i18n.placeholder, + }), + ); + + // Initialize search results. + search.addWidget( + instantsearch.widgets.infiniteHits({ + container: '#search-hits', + escapeHits: true, + templates: { + empty: '
' + i18n.no_results + '
', + item: getTemplate('search-hit-algolia'), + }, + cssClasses: { + showmoreButton: 'btn btn-outline-primary', + }, + }), + ); + + // On render search results, localize the content type metadata. + search.on('render', function () { + $('.search-hit-type').each(function () { + let content_key = $(this).text(); + if (content_key in content_type) { + $(this).text(content_type[content_key]); + } + }); + }); + + // Start search. + search.start(); +} diff --git a/js/isotope.pkgd.min.js b/js/isotope.pkgd.min.js new file mode 100644 index 00000000..4d6c129c --- /dev/null +++ b/js/isotope.pkgd.min.js @@ -0,0 +1,12 @@ +/*! + * Isotope PACKAGED v3.0.6 + * + * Licensed GPLv3 for open source use + * or Isotope Commercial License for commercial use + * + * https://isotope.metafizzy.co + * Copyright 2010-2018 Metafizzy + */ + +!function(t,e){"function"==typeof define&&define.amd?define("jquery-bridget/jquery-bridget",["jquery"],function(i){return e(t,i)}):"object"==typeof module&&module.exports?module.exports=e(t,require("jquery")):t.jQueryBridget=e(t,t.jQuery)}(window,function(t,e){"use strict";function i(i,s,a){function u(t,e,o){var n,s="$()."+i+'("'+e+'")';return t.each(function(t,u){var h=a.data(u,i);if(!h)return void r(i+" not initialized. Cannot call methods, i.e. 
"+s);var d=h[e];if(!d||"_"==e.charAt(0))return void r(s+" is not a valid method");var l=d.apply(h,o);n=void 0===n?l:n}),void 0!==n?n:t}function h(t,e){t.each(function(t,o){var n=a.data(o,i);n?(n.option(e),n._init()):(n=new s(o,e),a.data(o,i,n))})}a=a||e||t.jQuery,a&&(s.prototype.option||(s.prototype.option=function(t){a.isPlainObject(t)&&(this.options=a.extend(!0,this.options,t))}),a.fn[i]=function(t){if("string"==typeof t){var e=n.call(arguments,1);return u(this,t,e)}return h(this,t),this},o(a))}function o(t){!t||t&&t.bridget||(t.bridget=i)}var n=Array.prototype.slice,s=t.console,r="undefined"==typeof s?function(){}:function(t){s.error(t)};return o(e||t.jQuery),i}),function(t,e){"function"==typeof define&&define.amd?define("ev-emitter/ev-emitter",e):"object"==typeof module&&module.exports?module.exports=e():t.EvEmitter=e()}("undefined"!=typeof window?window:this,function(){function t(){}var e=t.prototype;return e.on=function(t,e){if(t&&e){var i=this._events=this._events||{},o=i[t]=i[t]||[];return o.indexOf(e)==-1&&o.push(e),this}},e.once=function(t,e){if(t&&e){this.on(t,e);var i=this._onceEvents=this._onceEvents||{},o=i[t]=i[t]||{};return o[e]=!0,this}},e.off=function(t,e){var i=this._events&&this._events[t];if(i&&i.length){var o=i.indexOf(e);return o!=-1&&i.splice(o,1),this}},e.emitEvent=function(t,e){var i=this._events&&this._events[t];if(i&&i.length){i=i.slice(0),e=e||[];for(var o=this._onceEvents&&this._onceEvents[t],n=0;n