notes

ImTaliesin · Sep 19, 2024 · e8ab34b · e8ab34b
1 parent 00368f1
commit e8ab34b
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 18 deletions.
diff --git a/.obsidian/graph.json b/.obsidian/graph.json
@@ -25,6 +25,6 @@
   "repelStrength": 12.2837370242215,
   "linkStrength": 1,
   "linkDistance": 30,
-  "scale": 0.8283218182655613,
+  "scale": 0.8283218182655638,
   "close": true
 }
diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json
@@ -11,12 +11,8 @@
             "id": "71c4ca83aff589db",
             "type": "leaf",
             "state": {
-              "type": "markdown",
-              "state": {
-                "file": "Data Engineering Notes.md",
-                "mode": "source",
-                "source": false
-              }
+              "type": "graph",
+              "state": {}
             }
           },
           {
@@ -71,12 +67,15 @@
             "id": "31b3f5731acf73c3",
             "type": "leaf",
             "state": {
-              "type": "graph",
-              "state": {}
+              "type": "markdown",
+              "state": {
+                "file": "SQL.md",
+                "mode": "source",
+                "source": false
+              }
             }
           }
-        ],
-        "currentTab": 5
+        ]
       }
     ],
     "direction": "vertical"
@@ -105,7 +104,7 @@
             "state": {
               "type": "search",
               "state": {
-                "query": "",
+                "query": "dataframe",
                 "matchingCase": false,
                 "explainSearch": false,
                 "collapseAll": false,
@@ -223,29 +222,29 @@
       "table-editor-obsidian:Advanced Tables Toolbar": false
     }
   },
-  "active": "31b3f5731acf73c3",
+  "active": "71c4ca83aff589db",
   "lastOpenFiles": [
     "Data Engineering Notes.md",
+    "SQL.md",
+    "databricks.md",
+    "Data Factory.md",
+    "Apache Spark.md",
+    "Data Lake.md",
     "Azure Synapse.md",
     "Pasted image 20240915181049.png",
     "Pasted image 20240915180527.png",
-    "Apache Spark.md",
     "SparkSQL.md",
     "Data Pipelines.md",
     "Azure Databricks.md",
     "Data Ingestion.md",
     "Delta Lake.md",
-    "SQL.md",
-    "databricks.md",
     "Unity Catalog.md",
     "Pasted image 20240911152450.png",
     "Pasted image 20240906143943.png",
     "Pasted image 20240904201734.png",
     "Pasted image 20240903232241.png",
     "Data Warehouse.md",
-    "Data Factory.md",
     "Azure Databases.md",
-    "Data Lake.md",
     "Fullstack NextJS List.md",
     "Database.md",
     "Pasted image 20240830192309.png",

diff --git a/SQL.md b/SQL.md
@@ -37,7 +37,30 @@ create view course_project.citibike.vw_bike_data as select * from course_project
 Deleting view:
 `DROP VIEW view_name`
 ## SQL Vocab
+### OPENROWSET
+OPENROWSET allows reading remote files without loading them into tables or creating external tables. Here are the key points:
+- Enables reading files from Azure Storage
+- Returns file contents as a set of rows with columns
+- Can be used in the FROM clause of a SELECT statement like a table or view
+- Supports reading files in CSV, Parquet, and Delta formats
+#### Syntax
 
+The basic syntax for OPENROWSET includes two mandatory parameters:
+
+1. BULK: Specifies the URL of the file or folder in Azure storage
+2. FORMAT: Indicates the file format (CSV, Parquet, or Delta)
+
+#### Example:
+
+```sql
+SELECT * FROM OPENROWSET(
+BULK 'https://storage_account.blob.core.windows.net/container/file.csv',  FORMAT = 'CSV' ) 
+AS [result]
+```
+#### Additional Parameters
+For CSV files, optional parameters can be specified, such as:
+- Delimiter (`FIELDTERMINATOR`)
+- Whether the file contains a header row (`HEADER_ROW = TRUE`, which requires `PARSER_VERSION = '2.0'`)
 ### SELECT
 - Used to retrieve data from one or more tables
 - Basic syntax: `SELECT column1, column2, ... FROM table_name;`
@@ -364,6 +387,61 @@ Correct:
 #### Remember:
 - Use WHERE for filtering individual rows based on column values.
 - Use HAVING for filtering groups based on the results of aggregate functions.
+## Create an external data source
+```sql
+create external data source nyc_taxi_data
+with (
+	LOCATION = 'abfss://nyc-taxi-data@synapseudemycoursedatalakes.dfs.core.windows.net/'
+)
+```
+## UTF8 Characters/Collation
+- COLLATE determines how string data is compared and sorted in a database.
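+- Strings can be stored in different ways. You can apply a collation in two ways: per column, by adding a `COLLATE` clause to the column definition, or for the entire database, with `ALTER DATABASE ... COLLATE`.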
+```sql
+--Collate data per column
+SELECT
+    *
+FROM
+    OPENROWSET(
+        BULK 'https://synapsecoursedatalakes.dfs.core.windows.net/nyc-taxi-data/raw/taxi_zone.csv',
+        FORMAT = 'CSV',
+        PARSER_VERSION = '2.0',
+        HEADER_ROW=TRUE
+    )
+    WITH (
+        LocationID SMALLINT,
+        Borough VARCHAR(15) COLLATE Latin1_General_100_CI_AI_SC_UTF8,
+        Zone VARCHAR(50) COLLATE Latin1_General_100_CI_AI_SC_UTF8,
+        service_zone VARCHAR(15) COLLATE Latin1_General_100_CI_AI_SC_UTF8)
+    AS fields
+```
+
+```sql
+--Collate the entire database (see the CREATE DATABASE / ALTER DATABASE statements below the query)
+SELECT
+    *
+FROM
+    OPENROWSET(
+        BULK 'https://synapsecoursedatalakes.dfs.core.windows.net/nyc-taxi-data/raw/taxi_zone.csv',
+        FORMAT = 'CSV',
+        PARSER_VERSION = '2.0',
+        HEADER_ROW=TRUE
+    )
+    WITH (
+        LocationID SMALLINT,
+        Borough VARCHAR(15) COLLATE Latin1_General_100_CI_AI_SC_UTF8,
+        Zone VARCHAR(50) COLLATE Latin1_General_100_CI_AI_SC_UTF8,
+        service_zone VARCHAR(15) COLLATE Latin1_General_100_CI_AI_SC_UTF8)
+    AS fields
+
+CREATE DATABASE nyc_taxi_discovery;
+
+USE nyc_taxi_discovery;
+
+ALTER DATABASE nyc_taxi_discovery COLLATE Latin1_General_100_CI_AI_SC_UTF8;
+
+SELECT name, collation_name FROM sys.databases;
+```
 ## Auth
 Serverless SQL pool authentication refers to how users prove their identity when connecting to the endpoint. Two types of authentication are supported: