Skip to content

Commit

Permalink
Merge pull request #5 from shauryauppal/master
Browse files Browse the repository at this point in the history
XLS Format to XLSX Format
  • Loading branch information
knadh authored Aug 27, 2020
2 parents 9f608aa + 9ed8ebc commit 14b4d54
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 20 deletions.
34 changes: 17 additions & 17 deletions rbiparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""A utility for downloading, parsing and sanitizing bank database (IFSC, MICR, address etc.) Excel sheets from the RBI website.
Scrapes the RBI IFS .xls sheet dumps and imports them.
1. Reads all the .xls URLs from the RBI page
Scrapes the RBI IFS .xlsx sheet dumps and imports them.
1. Reads all the .xlsx URLs from the RBI page
2. Downloads each file into a directory and
converts it to a CSV in another directory.
3. Each file's http etag header is saved in a file (etags.json),
Expand Down Expand Up @@ -70,23 +70,23 @@


def get_sheet_urls(url):
    """Scrape the RBI page and return the hrefs of all linked .xlsx sheets.

    Args:
        url: Address of the page that lists the sheet downloads.

    Returns:
        A list with the ``href`` value of every anchor on the page whose
        link ends in ``.xlsx``.

    Raises:
        Exception: If the page does not answer with HTTP 200, or if it
            contains no ``.xlsx`` links.
    """
    r = requests.get(url)
    if r.status_code != 200:
        # Build a single readable message; passing the url as a second
        # argument would make str(exc) render as a tuple.
        raise Exception(
            "Invalid response from %s (HTTP %d)" % (url, r.status_code))

    # Extract the urls. The pattern is a raw string so that "\." reaches
    # the regex engine as written instead of being a string escape.
    s = soup(r.content, "lxml")
    links = s.findAll("a", href=re.compile(r"\.xlsx$"))

    if not links:
        raise Exception("Couldn't find any .xlsx urls")

    return [link["href"] for link in links]


def convert_xls_to_csv(src, target, headers):
"""Convert .xls to .csv files."""
def convert_xlsx_to_csv(src, target, headers):
"""Convert .xlsx to .csv files."""
try:
sheet = xlrd.open_workbook(src).sheet_by_index(0)
except Exception as e:
Expand Down Expand Up @@ -169,14 +169,14 @@ def download(url, target):
}


def download_all(scrape_url, xls_dir, etags_file):
def download_all(scrape_url, xlsx_dir, etags_file):
"""Download all files."""
urls = get_sheet_urls(scrape_url)
logger.info("%d sheets to download" % (len(urls),))

# Create xls folder if path doesn't exist
if not os.path.exists(xls_dir):
os.mkdir(xls_dir)
# Create xlsx folder if path doesn't exist
if not os.path.exists(xlsx_dir):
os.mkdir(xlsx_dir)

# HTTP urls don't work.
urls = [u.replace("http:", "https:") for u in urls]
Expand All @@ -189,7 +189,7 @@ def download_all(scrape_url, xls_dir, etags_file):
logger.info("%d - %s" % (n, url))

fname = url_to_file(url)
xls_path = xls_dir + "/" + fname
xlsx_path = xlsx_dir + "/" + fname

# Get the URL headers.
try:
Expand All @@ -198,7 +198,7 @@ def download_all(scrape_url, xls_dir, etags_file):

if url in etags and \
etags[url] == et and \
os.path.isfile(xls_path):
os.path.isfile(xlsx_path):

logger.info("> Same etag. Skipping")
continue
Expand All @@ -209,7 +209,7 @@ def download_all(scrape_url, xls_dir, etags_file):
etags[url] = et
save_etags(etags, "etags.json")

download(url, xls_path)
download(url, xlsx_path)
except Exception as e:
logger.exception(e)
continue
Expand All @@ -221,14 +221,14 @@ def convert_all(src, target, headers):
if not os.path.exists(target):
os.mkdir(target)

files = glob.glob(src + "/*.xls")
files = glob.glob(src + "/*.xlsx")
for x in files:
c = target + "/" + x.split("/")[-1].replace(".xls", ".csv")
c = target + "/" + x.split("/")[-1].replace(".xlsx", ".csv")

logger.info("%s -> %s" % (x, c))

try:
convert_xls_to_csv(x, c, headers)
convert_xlsx_to_csv(x, c, headers)
except Exception as e:
logger.error("Failed: " + str(e))

Expand Down
6 changes: 3 additions & 3 deletions rbiparser/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@ def cli():
@cli.command()
@click.option('-s', '--source', type=click.STRING, default=SOURCE_URL,
              help="Source url to download documents. Defaults to RBI data source.")
@click.option('-d', '--dest', type=click.Path(dir_okay=True), default="xlsx",
              help="Download destination directory.")
@click.option('-e', '--etag', type=click.Path(file_okay=True), default="etags.json",
              help="Etags file")
def download(source, dest, etag):
    """Download all listed bank documents from RBI as .xlsx format."""
    # Thin CLI wrapper: the three option values are forwarded unchanged,
    # in order, to rbi.download_all.
    rbi.download_all(source, dest, etag)


@cli.command()
@click.option('-s', '--source', type=click.Path(dir_okay=True, exists=True), default="xls",
@click.option('-s', '--source', type=click.Path(dir_okay=True, exists=True), default="xlsx",
help="xls documents directory")
@click.option('-d', '--dest', type=click.Path(dir_okay=True), default="csv",
help="Target directory for CSV files.")
Expand Down

0 comments on commit 14b4d54

Please sign in to comment.