rm -r dp-203 -f
git clone https://github.com/MicrosoftLearning/dp-203-azure-data-engineer dp-203
cd dp-203/Allfiles/labs/01
./setup.ps1
-- This is auto-generated code
SELECT
TOP 100 *
FROM
OPENROWSET(
BULK 'https://datalakexxxxxxx.dfs.core.windows.net/files/sales/csv/**',
FORMAT = 'CSV',
PARSER_VERSION='2.0'
) AS [result]
-- Database for sales data
CREATE DATABASE Sales
COLLATE Latin1_General_100_BIN2_UTF8;
GO;
Use Sales;
GO;
-- External data is in the Files container in the data lake
CREATE EXTERNAL DATA SOURCE sales_data WITH (
LOCATION = 'https://datalakexxxxxxx.dfs.core.windows.net/files/'
);
GO;
-- Format for table files
CREATE EXTERNAL FILE FORMAT ParquetFormat
WITH (
FORMAT_TYPE = PARQUET,
DATA_COMPRESSION = 'org.apache.hadoop.io.compress.SnappyCodec'
);
GO;
USE Sales;
GO;
SELECT Item AS Product,
SUM(Quantity) AS ItemsSold,
ROUND(SUM(UnitPrice) - SUM(TaxAmount), 2) AS NetRevenue
FROM
OPENROWSET(
BULK 'sales/csv/*.csv',
DATA_SOURCE = 'sales_data',
FORMAT = 'CSV',
PARSER_VERSION = '2.0',
HEADER_ROW = TRUE
) AS orders
GROUP BY Item;
CREATE EXTERNAL TABLE ProductSalesTotals
WITH (
LOCATION = 'sales/productsales/',
DATA_SOURCE = sales_data,
FILE_FORMAT = ParquetFormat
)
AS
SELECT Item AS Product,
SUM(Quantity) AS ItemsSold,
ROUND(SUM(UnitPrice) - SUM(TaxAmount), 2) AS NetRevenue
FROM
OPENROWSET(
BULK 'sales/csv/*.csv',
DATA_SOURCE = 'sales_data',
FORMAT = 'CSV',
PARSER_VERSION = '2.0',
HEADER_ROW = TRUE
) AS orders
GROUP BY Item;
USE Sales;
GO;
CREATE PROCEDURE sp_GetYearlySales
AS
BEGIN
-- drop existing table
IF EXISTS (
SELECT * FROM sys.external_tables
WHERE name = 'YearlySalesTotals'
)
DROP EXTERNAL TABLE YearlySalesTotals
-- create external table
CREATE EXTERNAL TABLE YearlySalesTotals
WITH (
LOCATION = 'sales/yearlysales/',
DATA_SOURCE = sales_data,
FILE_FORMAT = ParquetFormat
)
AS
SELECT YEAR(OrderDate) AS CalendarYear,
SUM(Quantity) AS ItemsSold,
ROUND(SUM(UnitPrice) - SUM(TaxAmount), 2) AS NetRevenue
FROM
OPENROWSET(
BULK 'sales/csv/*.csv',
DATA_SOURCE = 'sales_data',
FORMAT = 'CSV',
PARSER_VERSION = '2.0',
HEADER_ROW = TRUE
) AS orders
GROUP BY YEAR(OrderDate)
END
statement, and observe that an error occurs.
Even though the script drops the external table, the folder containing the data is not deleted. To re-run the stored procedure (for example, as part of a scheduled data transformation pipeline), you must delete the old data.