Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding gsheets_ts #468

Merged
merged 17 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions data/recipes/gsheets_ts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"name": "gsheets_ts",
"description": "Collects data from Google Sheets and outputs it to Timesketch.",
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved
"short_description": "Collects data from Google Sheets and outputs it to Timesketch.",
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved
"preflights": [],
"modules": [{
"wants": [],
"name": "GoogleSheetsCollector",
"args": {
"spreadsheet": "@spreadsheet",
"sheets_names": "@sheets_names",
"validate_columns" :"@validate_columns"
}
}, {
"wants": ["GoogleSheetsCollector"],
"name": "TimesketchExporter",
"args": {
"incident_id": "@incident_id",
"token_password": "@token_password",
"sketch_id": "@sketch_id",
"analyzers": null,
"wait_for_timelines": "@wait_for_timelines"
}
}],
"args": [
["spreadsheet", "ID or URL of the Google Sheet spreadsheet to collect data from.", null],
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved
["--sheets_names", "Comma-separated list of sheet names to collect data from. Default is 'All', which parses all sheets in the spreadsheet.", ["All"]],
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved
["--validate_columns", "Set to True to check for the mandatory columns required by Timesketch while extracting data. Set to False to skip validation. Default is True.", true],
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved
["--sketch_id", "Sketch to which the timeline should be added", null],
["--token_password", "Optional custom password to decrypt Timesketch credential file with", ""],
["--incident_id", "Incident ID (used for Timesketch description)", null],
["--wait_for_timelines", "Whether to wait for timelines to finish processing.", true]
]
}
1 change: 1 addition & 0 deletions dftimewolf/cli/dftimewolf_recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
'WorkspaceAuditTimesketch': 'dftimewolf.lib.processors.workspace_audit_timesketch',
'TimesketchExporterThreaded': 'dftimewolf.lib.exporters.timesketch_tam',
'TurbiniaGCPProcessorThreaded': 'dftimewolf.lib.processors.turbinia_gcp_tam',
'GoogleSheetsCollector': 'dftimewolf.lib.collectors.gsheets',
}
# pylint: enable=line-too-long

Expand Down
246 changes: 246 additions & 0 deletions dftimewolf/lib/collectors/gsheets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
# -*- coding: utf-8 -*-
"""Pulls entries from Google Sheets."""

import os.path
import re
import tempfile
from typing import List, Optional, TYPE_CHECKING

from pandas.core.frame import DataFrame

from dftimewolf.lib import module
from dftimewolf.lib.containers import containers
from dftimewolf.lib.modules import manager as modules_manager
import filelock
from google.auth.exceptions import DefaultCredentialsError
from google.auth.exceptions import RefreshError
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient import discovery
import pandas as pd

if TYPE_CHECKING:
from dftimewolf.lib import state


class GoogleSheetsCollector(module.BaseModule):
  """Collector for entries from Google Sheets.

  Every selected sheet of a spreadsheet is fetched through the Google Sheets
  API, loaded into a pandas DataFrame, written to a temporary CSV file and
  stored in the state as a containers.File.

  Attributes:
    SCOPES: OAuth scopes requested when authorizing against the Sheets API.
  """

  SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

  # Both files are looked up in the user's home directory.
  _CREDENTIALS_FILENAME = '.dftimewolf_google_sheets_credentials.json'
  _CLIENT_SECRET_FILENAME = '.dftimewolf_google_sheets_client_secret.json'

  def __init__(self,
               state: 'state.DFTimewolfState',
               name: Optional[str] = None,
               critical: bool = False) -> None:
    """Initializes a Google Sheets collector.

    Args:
      state: recipe state, forwarded to the base module.
      name: optional module name, forwarded to the base module.
      critical: forwarded to the base module.
    """
    super(GoogleSheetsCollector, self).__init__(
        state, name=name, critical=critical)
    self._credentials: Optional[Credentials] = None
    self._spreadsheet_id = ''
    self._sheets_names: List[str] = []
    # These are mandatory columns required by Timesketch.
    self._mandatory_columns = ['message', 'datetime', 'timestamp_desc']
    self._all_sheets = False
    # Defined here so _ExtractEntiresFromSheet never hits a missing attribute
    # when it runs before SetUp.
    self._validate_columns = True

  # pylint: disable=arguments-differ
  def SetUp(self,
            spreadsheet: str,
            sheets_names: List[str],
            validate_columns: bool = True) -> None:
    """Sets up a Google Sheets collector.

    Args:
      spreadsheet: ID or URL of the spreadsheet to pull data from.
      sheets_names: List of sheet names inside the spreadsheet to parse. 'All'
          (case-insensitive) parses every sheet inside the spreadsheet. A
          single comma-separated string is accepted as well.
      validate_columns: Check that the mandatory columns required by
          Timesketch are present in the sheets.
    """
    self._credentials = self._GetCredentials()
    self._spreadsheet_id = self._ValidateSpreadSheetId(spreadsheet)

    # A bare string would otherwise be iterated character by character and
    # matched against sheet titles by substring.
    if isinstance(sheets_names, str):
      sheets_names = [
          name.strip() for name in sheets_names.split(',') if name.strip()]
    self._sheets_names = list(sheets_names)

    # Assigned unconditionally so that a second SetUp call can turn it off.
    self._all_sheets = any(
        name.lower() == 'all' for name in self._sheets_names)
    self._validate_columns = validate_columns

  def Process(self) -> None:
    """Copies entries from Google Sheets.

    Each selected, non-empty sheet is written to its own temporary CSV file
    and stored in the state as a containers.File. Credential failures are
    reported through ModuleError.
    """
    try:
      # Retrieve the list of sheets in the spreadsheet.
      service = self._BuildSheetsResource(self._credentials)
      result = service.spreadsheets().get(
          spreadsheetId=self._spreadsheet_id).execute()
      spreadsheet_title = result.get('properties', {}).get('title')

      for sheet in result.get('sheets', []):
        properties = sheet.get('properties')
        if not properties:
          continue

        sheet_title = properties.get('title')
        if not self._all_sheets and sheet_title not in self._sheets_names:
          continue

        # '!s' instead of ':s': the latter raises TypeError for anything that
        # is not a str (for example a missing title).
        self.logger.info('Parsing sheet: {0!s}'.format(sheet_title))

        df = self._ExtractEntiresFromSheet(self._spreadsheet_id, sheet_title)
        if df is None or df.empty:
          continue

        # delete=False: the file has to outlive this method because only its
        # path is handed over. The context manager closes the handle even if
        # to_csv fails; newline='' is what pandas asks for when it is given
        # an open text handle.
        with tempfile.NamedTemporaryFile(
            mode='w', delete=False, encoding='utf-8', suffix='.csv',
            newline='') as output_file:
          output_path = output_file.name
          self.logger.info(
              'Downloading results of sheet "{0!s}" to {1:s}'.format(
                  sheet_title, output_path))
          df.to_csv(index=False, na_rep='NaN', path_or_buf=output_file)

        self.logger.success(
            'Downloaded results of sheet "{0!s}" to {1:s}'.format(
                sheet_title, output_path))
        sheet_csv_file = containers.File(
            name='{0!s}_{1!s}'.format(spreadsheet_title, sheet_title),
            path=output_path)
        self.state.StoreContainer(sheet_csv_file)

    except (RefreshError, DefaultCredentialsError) as exception:
      self.ModuleError('Something is wrong with your gcloud access token or '
                       'Application Default Credentials. Try running:\n '
                       '$ gcloud auth application-default login')
      self.ModuleError(str(exception), critical=True)

  def _GetCredentials(self) -> Credentials:
    """Obtains API credentials for accessing the Google Sheets API.

    Stored credentials are loaded from the credentials file in the user's home
    directory. If they are missing or invalid they are refreshed or, failing
    that, requested through the OAuth console flow using the client secret
    file. New or refreshed credentials are written back to the credentials
    file.

    Returns:
      google.oauth2.credentials.Credentials: Google API credentials.
    """
    credentials = None
    home_directory = os.path.expanduser('~')

    # The credentials file stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    credentials_path = os.path.join(home_directory, self._CREDENTIALS_FILENAME)
    lock = filelock.FileLock(credentials_path + '.lock')  # pylint: disable=abstract-class-instantiated
    with lock:
      if os.path.exists(credentials_path):
        try:
          credentials = Credentials.from_authorized_user_file(
              credentials_path, self.SCOPES)
        except ValueError as exception:
          # '!s' converts the exception to text; ':s' would raise TypeError
          # and hide the original problem.
          self.logger.warning(
              'Unable to load credentials: {0!s}'.format(exception))
          credentials = None

      # If there are no (valid) credentials available, let the user log in.
      if not credentials or not credentials.valid:
        if credentials and credentials.expired and credentials.refresh_token:
          credentials.refresh(Request())
        else:
          secrets_path = os.path.join(
              home_directory, self._CLIENT_SECRET_FILENAME)
          if not os.path.exists(secrets_path):
            error_message = (
                'No OAuth application credentials available to access google '
                'sheets. Please generate OAuth application credentials (see '
                'https://developers.google.com/sheets/api/guides/authorizing) '
                'and save them to {0:s}.').format(secrets_path)
            self.ModuleError(error_message, critical=True)
          flow = InstalledAppFlow.from_client_secrets_file(
              secrets_path, self.SCOPES)
          credentials = flow.run_console()

        # Save the credentials for the next run.
        with open(credentials_path, 'w') as token_file:
          token_file.write(credentials.to_json())

    return credentials

  def _BuildSheetsResource(self,
                           credentials: Credentials) -> discovery.Resource:
    """Builds a Google Sheets resource object used to query spreadsheets.

    Args:
      credentials: Google API credentials.

    Returns:
      A resource object for interacting with the Google Sheets API (v4).
    """
    return discovery.build('sheets', 'v4', credentials=credentials)

  def _ValidateSpreadSheetId(self, spreadsheet: str) -> str:
    """Extracts the spreadsheet ID from an ID or URL and validates its format.

    Args:
      spreadsheet: ID or URL of the spreadsheet.

    Returns:
      The spreadsheet ID.
    """
    spreadsheet_match = re.search(r'.*?([01][0-9A-Za-z_-]{20,}).*',
                                  spreadsheet)
    if not spreadsheet_match:
      # The critical ModuleError is expected to abort here (the unit tests
      # assert a DFTimewolfError for malformed IDs).
      self.ModuleError(
          'spreadsheet id is not in the correct format {0:s}.'.format(
              spreadsheet),
          critical=True)

    return spreadsheet_match.group(1)

  def _ExtractEntiresFromSheet(self, spreadsheet_id: str,
                               sheet_title: str) -> Optional[DataFrame]:
    """Extracts the entries of one sheet into a DataFrame.

    The first row of the sheet is used as the header. Empty and missing cells
    are stored as the string 'NaN', and surrounding whitespace is removed
    from column names.

    Args:
      spreadsheet_id: ID of the spreadsheet to pull data from.
      sheet_title: Title of the sheet inside the spreadsheet to parse.

    Returns:
      DataFrame with the entries of the sheet, or None if the sheet holds no
      data or, when column validation is enabled, lacks a mandatory column.
    """
    resource = self._BuildSheetsResource(self._credentials)

    sheet_content_result = resource.spreadsheets().values().get(
        spreadsheetId=spreadsheet_id, range=sheet_title).execute()
    values = sheet_content_result.get('values', [])

    if not values:
      self.logger.warning(
          'No data found in sheet "{0!s}".'.format(sheet_title))
      return None

    # Removing white spaces from column names.
    header = [str(name).strip() for name in values[0]]

    # Rows can be shorter than the header (see the test fixtures). Pad them so
    # that DataFrame construction does not fail when every data row is short.
    # Rows longer than the header are left untouched.
    width = len(header)
    rows = [list(row) + [''] * (width - len(row)) for row in values[1:]]

    df = pd.DataFrame(rows, columns=header)
    df = df.replace('', 'NaN').fillna('NaN')

    if self._validate_columns:
      missing_columns = [
          column for column in self._mandatory_columns
          if column not in df.columns]
      if missing_columns:
        for column in missing_columns:
          self.logger.error(
              'Mandatory column "{0:s}" was not found in sheet "{1!s}".'.format(
                  column, sheet_title))
        self.logger.error(
            'Please make sure all mandatory columns are present:')
        self.logger.error(
            '"message": String with an informative message of the event')
        self.logger.error(
            '"datetime": ISO8601 format for example: 2015-07-24T19:01:01+00:00')
        self.logger.error(
            '"timestamp_desc": String explaining what type of timestamp it is '
            'for example file created')
        return None

    return df

# Register the collector class with the modules manager.
modules_manager.ModulesManager.RegisterModule(GoogleSheetsCollector)
90 changes: 90 additions & 0 deletions tests/lib/collectors/gsheets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests the Workspace logging timesketch processor."""

import unittest

from dftimewolf import config
from dftimewolf.lib import errors
from dftimewolf.lib import state
from dftimewolf.lib.collectors import gsheets
import mock


# Canned values response with all three columns the collector treats as
# mandatory ('message', 'datetime', 'timestamp_desc'). Some header names carry
# trailing spaces and one row is shorter than the header.
VALID_SHEET = {'range': 'Sheet1!A1:Z1000', 'majorDimension': 'ROWS', 'values': [['message', 'timestamp', 'datetime', 'timestamp_desc ', 'extra_field_1 ', 'extra_field_2'], ['A message', '1331698658276340', '2015-07-24T19:01:01+00:00', 'Write time', 'foo ', 'bar'], ['', '1331698658276340', '2016-07-24T19:01:01+00:00', 'create', 'dodo' ], ['sdsadasd', '', '', 'ddd', 'dodo', 'd']]} # pylint: disable=line-too-long
# Same layout, but missing the "datetime" column.
INVALID_SHEET = {'range': 'Sheet2!A1:Y1000', 'majorDimension': 'ROWS', 'values': [['message', 'timestamp', 'timestamp_desc ', 'extra_field_1 ','extra_field_2'], ['A message', '1331698658276340', 'Write time', 'foo ', 'bar'],['', '1331698658276340', 'create', 'dodo'],['sdsadasd', '', 'ddd', 'dodo', 'd']]} # pylint: disable=line-too-long
hkhalifa marked this conversation as resolved.
Show resolved Hide resolved

class GoogleSheetsCollectorTest(unittest.TestCase):
  """Tests for the Google Sheets collector module."""

  def testInitialization(self):
    """Tests that the collector can be initialized."""
    test_state = state.DFTimewolfState(config.Config)
    collector = gsheets.GoogleSheetsCollector(test_state)
    self.assertIsNotNone(collector)

  def testValidateSpreadSheetId(self):
    """Tests that the collector validates and extracts the spreadsheet ID."""
    test_state = state.DFTimewolfState(config.Config)
    collector = gsheets.GoogleSheetsCollector(test_state)
    expected_id = '1DD78vj61BEBoqpw69EdOoaxBUdDqM1GFxk5qRj7-vr4'

    # Malformed bare ID.
    with self.assertRaises(errors.DFTimewolfError):
      collector._ValidateSpreadSheetId('invalid-id')

    # Well-formed bare ID is returned unchanged.
    self.assertEqual(
        collector._ValidateSpreadSheetId(expected_id), expected_id)

    # URL that does not contain a well-formed ID.
    with self.assertRaises(errors.DFTimewolfError):
      collector._ValidateSpreadSheetId(
          'https://docs.google.com/spreadsheets/d/invalid-id/edit#gid=0')

    # URL that contains a well-formed ID.
    valid_id_in_url = (
        'https://docs.google.com/spreadsheets/d/'
        '1DD78vj61BEBoqpw69EdOoaxBUdDqM1GFxk5qRj7-vr4/edit#gid=0')
    self.assertEqual(
        collector._ValidateSpreadSheetId(valid_id_in_url), expected_id)

  # _GetCredentials is patched because SetUp calls it: unpatched, the test
  # would read files in the user's home directory and could start an
  # interactive OAuth flow.
  @mock.patch.object(gsheets.GoogleSheetsCollector, '_GetCredentials')
  @mock.patch('dftimewolf.lib.collectors.gsheets.discovery')
  def testExtractEntiresFromSheet(self, _mock_discovery, _mock_credentials):
    """Tests entry extraction with and without column validation.

    With validate_columns=True a valid sheet yields a DataFrame and a sheet
    that lacks a mandatory column yields None. With validate_columns=False
    both sheets yield a DataFrame.
    """
    test_state = state.DFTimewolfState(config.Config)
    collector = gsheets.GoogleSheetsCollector(test_state)

    spreadsheet_id = '1DD78vj61BEBoqpw69EdOoaxBUdDqM1GFxk5qRj7-vr4'
    sheet_title = 'Sheet1'

    service = _mock_discovery.build.return_value
    mock_execute = (
        service.spreadsheets.return_value.values.return_value
        .get.return_value.execute)

    # Testing with column validation enabled.
    collector.SetUp(spreadsheet_id, [sheet_title], True)

    mock_execute.return_value = VALID_SHEET
    valid_df = collector._ExtractEntiresFromSheet(spreadsheet_id, sheet_title)
    self.assertIsNotNone(valid_df)
    # Header names are stripped of surrounding whitespace.
    self.assertIn('timestamp_desc', list(valid_df.columns))
    self.assertEqual(len(valid_df), len(VALID_SHEET['values']) - 1)

    mock_execute.return_value = INVALID_SHEET
    self.assertIsNone(
        collector._ExtractEntiresFromSheet(spreadsheet_id, sheet_title))

    # Testing with column validation disabled.
    collector.SetUp(spreadsheet_id, [sheet_title], False)

    mock_execute.return_value = VALID_SHEET
    self.assertIsNotNone(
        collector._ExtractEntiresFromSheet(spreadsheet_id, sheet_title))

    mock_execute.return_value = INVALID_SHEET
    self.assertIsNotNone(
        collector._ExtractEntiresFromSheet(spreadsheet_id, sheet_title))


# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
  unittest.main()