Updating HDInsight lab
This commit is contained in:
Родитель
d7922b6f8c
Коммит
dc7af7adec
|
@ -28,7 +28,7 @@ Learn the basics of data science using Spark
|
|||
This notebook demonstrates how to use MLlib, Spark's built-in machine
|
||||
learning libraries, to perform a simple prediction on an open dataset.
|
||||
|
||||
**Launch Jupyter Notebooks **
|
||||
### Launch Jupyter Notebook
|
||||
Navigate to the link below, replacing the placeholder with your cluster name, and sign in with the username and password provided. https://<Fill_ME_IN>.azurehdinsight.net/jupyter/tree/PySpark
|
||||
|
||||
- Username: <FILL\_ME\_IN>
|
||||
|
@ -61,7 +61,7 @@ Read the dataset from a csv file stored in Azure Blob Storage.
|
|||
|
||||
```python
|
||||
inspections =
|
||||
spark.read.csv('wasb:///HdiSamples/HdiSamples/FoodInspectionData/FoodInspections1.csv',
|
||||
spark.read.csv('wasb:///HdiSamples/HdiSamples/FoodInspectionData/Food_Inspections1.csv',
|
||||
inferSchema=True)
|
||||
```
|
||||
#### Inspect Schema
|
||||
|
@ -86,12 +86,12 @@ df.select('results').distinct().show()
|
|||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
labels = count\_results\_df\['results'\]
|
||||
labels = count_results_df['results']
|
||||
|
||||
sizes = count\_results\_df\['cnt'\]
|
||||
sizes = count_results_df['cnt']
|
||||
|
||||
colors = \['turquoise', 'seagreen', 'mediumslateblue', 'palegreen',
|
||||
'coral'\]
|
||||
colors = ['turquoise', 'seagreen', 'mediumslateblue', 'palegreen',
|
||||
'coral']
|
||||
|
||||
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors)
|
||||
|
||||
|
@ -110,7 +110,7 @@ We can use the model we created earlier to predict what the results of
|
|||
new inspections will be, based on the violations that were observed.
|
||||
```python
|
||||
testData =
|
||||
selectInterestingColumns(spark.read.csv('wasb:///HdiSamples/HdiSamples/FoodInspectionData/Food\_Inspections2.csv',
|
||||
selectInterestingColumns(spark.read.csv('wasb:///HdiSamples/HdiSamples/FoodInspectionData/Food_Inspections2.csv',
|
||||
inferSchema=True))
|
||||
|
||||
testDf = testData.where("results = 'Fail' OR results = 'Pass' OR results
|
||||
|
@ -137,7 +137,7 @@ print("There were %d inspections and there were %d successful
|
|||
predictions" % (numInspections, numSuccesses))
|
||||
|
||||
print("This is a %d%% success rate" % (float(numSuccesses) /
|
||||
float(numInspections) \* 100))
|
||||
float(numInspections) * 100))
|
||||
```
|
||||
#### Final visualization to help us reason about the results of this test.
|
||||
|
||||
|
@ -302,7 +302,7 @@ print("There were %d User sessions and there were %d successful
|
|||
predictions" % (numInspections, numSuccesses))
|
||||
|
||||
print("This is a %d%% success rate" % (float(numSuccesses) /
|
||||
float(numInspections) \* 100))
|
||||
float(numInspections) * 100))
|
||||
```
|
||||
#### Final visualization to help us reason about the results of this test.
|
||||
|
||||
|
|
|
@ -83,8 +83,7 @@ PaymentAmount varchar(50)
|
|||
|
||||
STORED AS TEXTFILE LOCATION 'wasb:///hadooplabs/Lab1/weblogs/';
|
||||
|
||||
LOAD DATA INPATH 'wasb:///hadooplabs/Lab1/weblogs.csv' INTO TABLE
|
||||
HDILABDB.weblogs;
|
||||
LOAD DATA INPATH 'wasb:///hadooplabs/Lab1/weblogs.csv' INTO TABLE HDILABDB.weblogs;
|
||||
```
|
||||
- Click Execute to run the query. Once the query completes, the Query
|
||||
Process Results status will change to **SUCCEEDED**.
|
||||
|
@ -202,14 +201,11 @@ month. The output should look like this.
|
|||
|
||||
DROP TABLE IF EXISTS HDILABDB.SalesbyCategory;
|
||||
|
||||
CREATE TABLE HDILABDB.SalesbyCategory ROW FORMAT DELIMITED
|
||||
|
||||
FIELDS TERMINATED by '\1' lines TERMINATED by '\n'
|
||||
CREATE TABLE HDILABDB.SalesbyCategory ROW FORMAT DELIMITED FIELDS TERMINATED by '\1' lines TERMINATED by '\n'
|
||||
|
||||
STORED AS TEXTFILE LOCATION 'wasb:///hadooplabs/Lab1/SalesbyCategory'
|
||||
|
||||
AS
|
||||
|
||||
Select
|
||||
|
||||
categoryname,
|
||||
|
@ -218,12 +214,8 @@ Sum(Quantity) As quantitysold,
|
|||
|
||||
Sum(PaymentAmount) As totalamount
|
||||
|
||||
FROM HDILABDB.weblogs
|
||||
|
||||
WHERE PurchaseType="Purchased"
|
||||
|
||||
FROM HDILABDB.weblogs WHERE PurchaseType="Purchased"
|
||||
GROUP BY CategoryName
|
||||
|
||||
ORDER BY QuantitySold Desc;
|
||||
|
||||
Select * from HDILABDB.SalesbyCategory LIMIT 10
|
||||
|
@ -250,10 +242,7 @@ sold per book. The output should look like this.
|
|||
-- Top Selling Books
|
||||
|
||||
DROP TABLE IF EXISTS HDILABDB.SalesbyBooks;
|
||||
|
||||
CREATE TABLE HDILABDB.SalesbyBooks ROW FORMAT DELIMITED FIELDS
|
||||
|
||||
TERMINATED by '\1' lines TERMINATED by '\n'
|
||||
CREATE TABLE HDILABDB.SalesbyBooks ROW FORMAT DELIMITED FIELDS TERMINATED by '\1' lines TERMINATED by '\n'
|
||||
|
||||
STORED AS TEXTFILE LOCATION 'wasb:///hadooplabs/Lab1/SalesbyBooks'
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче