
Commit

Added DAG file
ErikaJacobs committed Jul 4, 2020
1 parent a008911 commit d92a75e
Showing 2 changed files with 68 additions and 41 deletions.
50 changes: 50 additions & 0 deletions acnh_dag.py
@@ -0,0 +1,50 @@
# Airflow DAG

from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# Default Argument

default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(1),
    # 'end_date': datetime(2018, 12, 30),
    'depends_on_past': True,
    'email': ['ErikaAshley3@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    # provide_context=True hands the Airflow context (including task_instance)
    # to each python_callable, which the tasks use for XCom push/pull.
    'provide_context': True}

# DAG

dag = DAG(
    'ACNH_Popularity',
    default_args=default_args,
    description='Scrapes Animal Crossing Villager data and sends to MySQL',
    # Cron schedule: midnight on the 1st and 15th of every month
    schedule_interval='0 0 1,15 * *')

# Tasks

# Set Working Directory

# Input - Script Directory
file = '/mnt/c/Users/cluel/Documents/GitHub/Animal-Crossing-Popularity-Data'

####################################

# Import Class
from scripts import acnh_pop as ac
cl = ac.acnh_pop_class()

# Create Tasks
# t1 scrapes the villager tier list page and pushes the page source to XCom
t1 = PythonOperator(task_id='scrape_web_data', python_callable=cl.acnGetPopData, dag=dag)
# t2 pulls the page source from XCom, builds the villager data, and sends it to MySQL
t2 = PythonOperator(task_id='df_and_mysql', python_callable=cl.getVillagerInfo, dag=dag)

# Task Organization

t1 >> t2
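For reference, the XCom handoff these two tasks rely on can be seen in isolation in the sketch below. This is a minimal, self-contained Airflow 1.x example with hypothetical callables (push_page, parse_page), hypothetical task ids, and a throwaway DAG id, not the project's actual code.

# Minimal sketch of the provide_context + XCom pattern used by t1 and t2 above.
# The DAG id, task ids, and callables here are hypothetical.
import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

def push_page(**kwargs):
    page = "<html>...</html>"  # stand-in for the scraped page source
    kwargs['task_instance'].xcom_push(key="page_source", value=page)

def parse_page(**kwargs):
    # Pull by the pushing task's task_id and the key it pushed under.
    page = kwargs['task_instance'].xcom_pull(task_ids='push', key='page_source')
    print(len(page))

demo = DAG('xcom_demo',
           default_args={'owner': 'airflow',
                         'start_date': airflow.utils.dates.days_ago(1)},
           schedule_interval=None)

push = PythonOperator(task_id='push', python_callable=push_page,
                      provide_context=True, dag=demo)
parse = PythonOperator(task_id='parse', python_callable=parse_page,
                       provide_context=True, dag=demo)

push >> parse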
59 changes: 18 additions & 41 deletions acnh_pop.py
@@ -1,7 +1,7 @@

#%%
class acnh_pop_class:
def acnGetPopData(self):
def acnGetPopData(self, **kwargs):
from selenium import webdriver

options = webdriver.ChromeOptions()
@@ -22,10 +22,21 @@ def acnGetPopData(self):
time.sleep(2)

page_source = driver.page_source

task_instance = kwargs['task_instance']
task_instance.xcom_push(key="page_source", value=page_source)

return page_source


#%%
def getVillagerInfo(self, page_source):
def getVillagerInfo(self, **kwargs):

import sys
sys.setrecursionlimit(10000)

task_instance = kwargs['task_instance']
page_source = task_instance.xcom_pull(task_ids='scrape_web_data', key='page_source')

from bs4 import BeautifulSoup

@@ -65,13 +76,11 @@ def getVillagerInfo(self, page_source):
'villager_tier': villager_tier,
'villager_value': villager_value}

return villager_field_dict
#%%
#task_instance = kwargs['task_instance']
#villager_field_dict = task_instance.xcom_push(key="villager_field_dict", value=villager_field_dict)

# Create Pandas Dataframe
# Create df

def acnhpop_df(self, villager_field_dict):

import pandas as pd
import datetime

@@ -112,12 +121,6 @@ def name_change(x):
return name

df['villager_name'] = df['villager_name'].apply(lambda x: name_change(x))

return df

#%%

def kaggle_data(self):

# Get Table from Kaggle

@@ -151,15 +154,8 @@ def kaggle_data(self):

df_kag = pd.read_csv(csv_after)

return df_kag

#%%

# Join Kaggle and Popularity together

def joinTables(self, df, df_kag):

import pandas as pd
def push_function():
return df_kag

df_final = pd.merge(df, df_kag, how='left', left_on=['villager_name'], right_on=['Name'])

@@ -191,12 +187,6 @@ def birthday_clean(x):
continue
else:
df_final.rename(columns={old:new}, inplace=True)

return df_final

#%%

def mysql(self, df_final):

import datetime

@@ -261,19 +251,6 @@ def mysql(self, df_final):

#%%

#%%
# Procedure

#from datetime import datetime
#NOW = datetime.now()

#page_source = acnGetPopData()
#villager_field_dict = getVillagerInfo(page_source)
#df = acnhpop_df(villager_field_dict)
#df_kag = kaggle_data()
#df_final = joinTables(df, df_kag)
#mysql(df_final)

#%%
# Sources
# https://medium.com/ymedialabs-innovation/web-scraping-using-beautiful-soup-and-selenium-for-dynamic-page-2f8ad15efe25
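The collapsed portions of acnGetPopData and getVillagerInfo follow the headless-Chrome-plus-BeautifulSoup pattern the source above describes. A rough, self-contained sketch of that flow is below; the URL and CSS selector are placeholders, not the ones this project uses.

# Hedged sketch: headless Chrome fetches a JS-rendered page, BeautifulSoup parses it.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')           # run without a visible browser window
options.add_argument('--no-sandbox')

driver = webdriver.Chrome(options=options)
driver.get('https://example.com/tier-list')  # placeholder URL
time.sleep(2)                                # crude wait for JavaScript to render
page_source = driver.page_source
driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
names = [el.get_text(strip=True) for el in soup.select('.villager-name')]  # placeholder selector
print(names)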
