@@ -21,11 +21,13 @@ static void Main(string[] args)
2121
2222 string filePath = args [ 0 ] ;
2323
24+ // Create Spark session
2425 SparkSession spark = SparkSession
2526 . Builder ( )
2627 . AppName ( "GitHub and Spark Batch" )
2728 . GetOrCreate ( ) ;
2829
30+ // Create initial DataFrame
2931 DataFrame projectsDf = spark
3032 . Read ( )
3133 . Schema ( "id INT, url STRING, owner_id INT, " +
@@ -34,6 +36,7 @@ static void Main(string[] args)
3436 "updated_at STRING" )
3537 . Csv ( filePath ) ;
3638
39+ // Display results
3740 projectsDf . Show ( ) ;
3841
3942 // Drop any rows with NA values
@@ -42,27 +45,33 @@ static void Main(string[] args)
4245
4346 // Remove unnecessary columns
4447 cleanedProjects = cleanedProjects . Drop ( "id" , "url" , "owner_id" ) ;
48+
49+ // Display results
4550 cleanedProjects . Show ( ) ;
4651
4752 // Average number of times each language has been forked
4853 DataFrame groupedDF = cleanedProjects
4954 . GroupBy ( "language" )
5055 . Agg ( Avg ( cleanedProjects [ "forked_from" ] ) ) ;
5156
52- // Sort by most forked languages first
57+ // Sort by most forked languages first & Display results
5358 groupedDF . OrderBy ( Desc ( "avg(forked_from)" ) ) . Show ( ) ;
5459
60+ // Defines a UDF that determines if a date is greater than a specified date
5561 spark . Udf ( ) . Register < string , bool > (
5662 "MyUDF" ,
5763 ( date ) => DateTime . TryParse ( date , out DateTime convertedDate ) &&
5864 ( convertedDate > referenceDate ) ) ;
5965
66+ // Use UDF to add columns to the generated TempView
6067 cleanedProjects . CreateOrReplaceTempView ( "dateView" ) ;
61-
6268 DataFrame dateDf = spark . Sql (
6369 "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView" ) ;
70+
71+ // Display results
6472 dateDf . Show ( ) ;
6573
74+ // Stop Spark session
6675 spark . Stop ( ) ;
6776 }
6877 }
0 commit comments