This commit is contained in:
Mohammad Derakhshani 2017-10-15 23:09:54 -07:00
Parent fbf720d86a
Commit fe70d0736d
2 changed files with 30 additions and 3 deletions

bulkimport/Readme.txt (new file, 22 additions)

@@ -0,0 +1,22 @@
The following numbers were measured when importing 1,000,000 documents, each 1 KB in size, into a collection provisioned with 1,000K RU/s throughput and 100 partitions. The bulk import tool ran in Gateway mode on an Ubuntu Linux machine with 16 CPU cores, provisioned through the Azure portal.
Number of documents inserted in this checkpoint: 1,000,000
Import time for this checkpoint in milliseconds: 9,106
Total request unit consumed in this checkpoint: 5,708,431.08
Average RUs/second in this checkpoint: 626,886
Average #Inserts/second in this checkpoint: 109,817
############################################################
############################################################
NOTE: to get higher throughput:
1) Set the JVM heap size to a large enough value to avoid memory issues when handling a large number of documents.
2) There is a preprocessing and warm-up cost per bulk import, so larger bulks achieve higher throughput. For example, to import 10,000,000 documents, running bulk import 10 times on bulks of 1,000,000 documents each is preferable to running it 100 times on bulks of 100,000 documents each (see the usage sketch at the end of this file).
An example of how to use the bulk import API:
https://github.com/Azure/azure-documentdb-java/blob/moderakh/bulkimport-implementation/bulkimport/src/test/java/com/microsoft/azure/documentdb/bulkimport/Sample.java
Sample invocation of the command-line tool for benchmarking bulk import (5 checkpoints, each inserting 1,000,000 documents):
java -Xmx6G -jar documentdb-bulkimport-1.0.0-SNAPSHOT-jar-with-dependencies.jar -serviceEndpoint ACCOUNT_HOST -masterKey ACCOUNT_MASTER_KEY -databaseId DATABASE_NAME -collectionId COLLECTION_NAME -maxConnectionPoolSize 200 -numberOfDocumentsForEachCheckpoint 1000000 -numberOfCheckpoints 5
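
For reference, below is a minimal usage sketch assembled from the linked Sample; it is a sketch, not one of the shipped samples. ACCOUNT_HOST, ACCOUNT_MASTER_KEY, DATABASE_NAME and COLLECTION_NAME are placeholders, and DataMigrationDocumentSource is the test-only document generator from this repository's test sources; substitute your own collection of JSON document strings there.

package com.microsoft.azure.documentdb.bulkimport;

import java.util.Collection;

import com.microsoft.azure.documentdb.ConnectionMode;
import com.microsoft.azure.documentdb.ConnectionPolicy;
import com.microsoft.azure.documentdb.ConsistencyLevel;
import com.microsoft.azure.documentdb.DocumentClient;
import com.microsoft.azure.documentdb.DocumentCollection;

public class BulkImportSketch {

    public static void main(String[] args) throws Exception {
        // Gateway mode with a large connection pool, matching the benchmark setup above.
        ConnectionPolicy policy = new ConnectionPolicy();
        policy.setConnectionMode(ConnectionMode.Gateway);
        policy.setMaxPoolSize(200);

        DocumentClient client = new DocumentClient("ACCOUNT_HOST", "ACCOUNT_MASTER_KEY",
                policy, ConsistencyLevel.Session);
        DocumentCollection collection = client.readCollection(
                "dbs/DATABASE_NAME/colls/COLLECTION_NAME", null).getResource();

        // BulkImporter is AutoCloseable; try-with-resources releases its internal executors.
        try (BulkImporter importer = new BulkImporter(client, collection)) {
            // Prefer a few large checkpoints over many small ones (see the NOTE above).
            for (int i = 0; i < 10; i++) {
                // Test-only data generator; replace with your own JSON document strings.
                Collection<String> docs = DataMigrationDocumentSource.loadDocuments(
                        1000000, collection.getPartitionKey());
                BulkImportResponse response = importer.bulkImport(docs, false);
                // response carries the per-checkpoint statistics reported above
                // (documents imported, request units consumed, import time).
            }
        } finally {
            client.close();
        }
    }
}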

bulkimport/src/test/java/com/microsoft/azure/documentdb/bulkimport/Sample.java (8 additions, 3 deletions)

@@ -22,10 +22,7 @@
*/
package com.microsoft.azure.documentdb.bulkimport;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import com.microsoft.azure.documentdb.ConnectionPolicy;
@@ -50,6 +47,14 @@ public class Sample {
try (BulkImporter importer = new BulkImporter(client, collection)) {
// NOTE: to get higher throughput:
// 1) Set the JVM heap size to a large enough value to avoid memory issues
//    when handling a large number of documents.
// 2) There is a preprocessing and warm-up cost per bulk import, so larger bulks
//    achieve higher throughput. To import 10,000,000 documents, running bulk import
//    10 times on bulks of 1,000,000 documents each is preferable to running it
//    100 times on bulks of 100,000 documents each.
for (int i = 0; i < 10; i++) {
Collection<String> docs = DataMigrationDocumentSource.loadDocuments(1000000, collection.getPartitionKey());
BulkImportResponse bulkImportResponse = importer.bulkImport(docs, false);
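// bulkImportResponse carries the per-checkpoint statistics surfaced in the Readme
// (number of documents imported, total request units consumed, import time).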