From 5493a53440583142adc6cc9788501eb5c1c41bb8 Mon Sep 17 00:00:00 2001
From: annaekhoo <60131983+annaekhoo@users.noreply.github.com>
Date: Wed, 29 Jul 2020 10:56:05 +0100
Subject: [PATCH] Add files via upload

---
 insolvency_master.Rmd | 513 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 513 insertions(+)
 create mode 100644 insolvency_master.Rmd

diff --git a/insolvency_master.Rmd b/insolvency_master.Rmd
new file mode 100644
index 0000000..966a11d
--- /dev/null
+++ b/insolvency_master.Rmd
@@ -0,0 +1,513 @@
---
title: "Insolvency"
output: html_document
author: Anna Khoo and Paul Bradshaw
date: 20/07/2020
---

```{r setup}
.libPaths("D:/R/Libraries")
pacman::p_load(tidyverse, zoo, ggplot2, janitor, rio, data.table)
```

## 1. Collate gazette scrape data

Import and clean the gazette scrape data.

```{r gazette scrape data import}
filenames <- list.files(path="Gazette_scrape_new/", pattern="*.csv", full.names=TRUE)
gazette_1 <- rbindlist(lapply(filenames, fread)) %>%
  remove_empty("cols")
```

Then add in the older scrape, whose columns need to be matched first.

```{r echo=FALSE}
filenames <- list.files(path="Gazette notices scrape/", pattern="*.tsv", full.names=TRUE)
#set the quote character to ¬ to avoid parsing issues
gazette_2 <- do.call(rbind, lapply(filenames, function(x) read_tsv(x, col_names = TRUE, quote = "¬")))

gazette_2 <- gazette_2 %>%
  filter(is.na(Error)) %>%
  remove_empty("cols")

write_csv(gazette_2, "gazette_2.csv")
```

# Tidying up and collating scrape datasets

We need to deduplicate on notice URL just in case. `dob` isn't useful, `noticecode` will need cleaning before the code can be extracted (gazette_2 is not as tidy), and `persdetail` can come out for the moment too (though it could be useful for personal insolvency).

```{r bind gazette scrape data}
gazette_1 <- gazette_1 %>%
  select(-V1, -dob, -persdetail)
gazette_2 <- gazette_2 %>%
  select("address"="Regoffice", "companyname"="Name", "compnum"="Companynum", "noticecategory"="Category",
         "noticecode"="Code", "noticetype"="Noticetype", "pubdate"="Pubdate", "typeofliq"="Liquidationtype", "url"="Source Url")

gazette <- rbind(gazette_1, gazette_2)
#clean the environment
rm(gazette_1, gazette_2)

#deduplicate on notice URL
gazette <- gazette[!duplicated(gazette$url),]
```

# Clean and filter to just insolvencies

That dataset has quite a lot of non-entries, and we're only interested in insolvency-type notices.

```{r gazette clean and filter to liquidations}
gazette$compnum <- gsub("NO DATA", NA, gazette$compnum)
gazette <- gazette %>%
  remove_constant() %>%
  filter(!is.na(compnum))

#subset of notices for liquidation only; we could take just 'Appointment of Liquidators',
#but a custom subset allows multiple types
#liquidations <- gazette %>%
#  filter(noticetype=="Appointment of Liquidators")

insolvency_notices <- c("Appointment of Liquidators","Appointment of Administrators","Resolutions for Winding Up","Resolutions for Winding-Up","Appointment of Receivers","Winding-Up Orders")

liquidations <- gazette %>%
  filter(noticetype %in% insolvency_notices)

#then deduplicate on company number to preserve only one entry per company:
liquidations <- liquidations[!duplicated(liquidations$compnum),]
```

That's quite small, but we'll leave it there for now.

```{r gazette split out dates}
liquidations <- liquidations %>%
  tidyr::separate(col=pubdate, into=c("day","month","year"), sep=" ")
names(liquidations)
```
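It's worth a quick check that the `pubdate` split behaved - a minimal sketch, assuming the publication dates are of the form "29 July 2020":

```{r optional date split check}
#Optional sanity check on the date split: every month should be a month
#name and every year a four-digit string; anything else points to an
#unexpected pubdate format.
liquidations %>% count(month, sort = TRUE)
liquidations %>% count(year, sort = TRUE)
```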
## 2. Match to Companies House snapshot

Now we pull in the Companies House (CH) data and match it to get detailed information for each company in the notices.

```{r companies house import}
#using the February snapshot, the oldest we have
Feb_CH <- fread("BasicCompanyDataAsOneFile-2020-02-01.csv",
                select=c(1:13,15,20,27:30,33))
names(Feb_CH)
```

Prep the gazette data: some company numbers have lost their leading zeros and will fail the match.

```{r prep liquidations for join}
liquidations$compnum <- gsub("\n ","",liquidations$compnum)

#some company numbers are missing a leading zero in the gazette, leaving strings of 7 chars
#make a function to correct this and overwrite the vector:

#nb: for the TRUE/FALSE test to work, x must not be NA
correct_co_num <- function(x,n){
  if ((!is.na(x)) & (nchar(x) < n)){
    x <- str_pad(x, n, pad="0")
  }
  return(x)
}

#pad every number to the standard 8 characters
liquidations$compnum <- unlist(lapply(liquidations$compnum, correct_co_num, n=8))

liquidations <- liquidations %>%
  rename("CompanyNumber"="compnum")

#check it worked
#unique(liquidations$CompanyNumber)
```

Now, that function has created one particular entry that's not helpful: 00113155 (really JE113155). If we change the zero correction we lose 00263995, and maybe others, so we'll try removing that rogue entry and a few others before passing the unmatched entries to the API.

```{r}
liquidations$CompanyNumber <- gsub("00113155","JE113155",liquidations$CompanyNumber)
#this one has also falsely 'inherited' a zero; it was incorrect in the gazette source data:
liquidations$CompanyNumber <- gsub("011182451","11182451",liquidations$CompanyNumber)
#these are also incorrect in the source data
liquidations$CompanyNumber <- gsub("00196402","01964102",liquidations$CompanyNumber)
liquidations$CompanyNumber <- gsub("01074011","10714011",liquidations$CompanyNumber)
liquidations$CompanyNumber <- gsub("01168702","11168702",liquidations$CompanyNumber)
```

Now we'll join:

```{r join ch to gazette}
house_gazette <- inner_join(liquidations, Feb_CH, by="CompanyNumber")
```

### a) Store unmatched results to match via API

```{r find and store unmatched results}
unmatched_gazette <- anti_join(liquidations, Feb_CH, by="CompanyNumber")
write_csv(unmatched_gazette, "unmatched_chg.csv")
```

## Query the Companies House API - this code is written by Paul Bradshaw

We can query the [Companies House API](https://developer.companieshouse.gov.uk/api/docs/) for data on each of the companies that don't have a match in the snapshot data.

A query for a company looks like this: `https://api.companieshouse.gov.uk/company/09991766` (where the last 8 characters are the company number).

But you also need a key.

You can use the command line to query it like so: `curl -uYOURKEYHERE: https://api.companieshouse.gov.uk/company/09991766 > 09991766.json`

That fetches the data from the specified URL, using the specified key to authenticate (you need to create a key at https://developer.companieshouse.gov.uk/developer/applications), and downloads it as a file with the specified name. This will be saved in the location you have navigated to in Terminal (if on a Mac) before running the command.

To replicate that process in R we can use a [package dedicated to querying the Companies House API](https://rdrr.io/github/MatthewSmith430/CompaniesHouse/).

```{r install CompaniesHouse package}
#If you get an error, uncomment the lines below
#install.packages("remotes")
#remotes::install_github("MatthewSmith430/CompaniesHouse")
```
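If the GitHub package won't install, the same request can be made directly with httr and jsonlite - a minimal sketch mirroring the curl call above (basic auth with the key as the username and an empty password; the helper name `fetch_company` is ours, not part of any package):

```{r httr fallback sketch}
library(httr)
library(jsonlite)

#Sketch of a direct API call, replicating the curl example above
fetch_company <- function(company_number, key){
  resp <- GET(paste0("https://api.companieshouse.gov.uk/company/", company_number),
              authenticate(key, ""))
  stop_for_status(resp)
  fromJSON(content(resp, as = "text", encoding = "UTF-8"))
}

#Usage (with your own key):
#fetch_company("09991766", mkey)
```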
First we test it. Note that the key in the code below has been replaced, so you cannot use mine!

```{r test query}
#Store the key - you will need to change this
mkey <- "TqtcY58VXzY9zh542Qc_2DoxGXKhP5Bn837OMD4b"
test <- CompaniesHouse::CompanyDataExtract(company_number = "08430745", mkey)
```

We need to repeat that for each number in the list.

```{r show first few company numbers}
head(unmatched_gazette$CompanyNumber)
```

Some are dirty - we get this error when running the code in the loop below:

`[1] "11102916\t" Error: parse error: trailing garbage`

So we need to check for that:

```{r gsub tab, eval=FALSE}
#don't run - this was moved higher up, pre-match
unmatched_gazette$CompanyNumber <- gsub("\t","",unmatched_gazette$CompanyNumber)
```

But `"SC404802 \n"` causes an error too, so we adopt a different approach, using regex to clean them up:

```{r regex clean}
#regex from https://stackoverflow.com/questions/50697242/input-mask-for-uk-company-registration-number
#Adapted to factor in OC, and the $ removed so it doesn't have to match to the end
#We could also add IP for Industrial & Provident Companies, but those would need an R
#on the end and only 5 digits, and they include companies that throw errors
#see https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/426891/uniformResourceIdentifiersCustomerGuide.pdf
#Also see: https://www.doorda.com/glossary/company-number-prefixes/
compnumclean <- stringr::str_extract(unmatched_gazette$CompanyNumber,"^(SC|NI|OC|IP|[0-9]{2})[0-9]{6}")
unmatched_gazette$compnumclean <- compnumclean
#then we have the issue of companies with extra info we don't want in brackets after the number
compnumclean <- gsub("\\([A-Z]*?\\)","",compnumclean)
unmatched_gazette$compnumclean <- compnumclean
#that still doesn't sort a few, so remove them manually:
unmatched_gazette$compnumclean <- gsub("34351172",NA,unmatched_gazette$compnumclean)
unmatched_gazette$compnumclean <- gsub("00000000",NA,unmatched_gazette$compnumclean)
unmatched_gazette$compnumclean <- gsub("00108433",NA,unmatched_gazette$compnumclean)
unmatched_gazette$compnumclean <- gsub("01411078",NA,unmatched_gazette$compnumclean)
#this one is a charity
unmatched_gazette$compnumclean <- gsub("01031671",NA,unmatched_gazette$compnumclean)
```

The problem with this is that it produces some `NA` results, so the code above is adapted until we end up with an NA subset that only contains companies we can't track or are not interested in.
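As a quick check that the regex behaves as intended, it can be spot-checked against the kinds of dirty values seen above (the test vector is illustrative, not drawn from the data):

```{r regex spot check}
#Spot-check the cleaning regex against illustrative dirty values
tests <- c("11102916\t", "SC404802 \n", "OC123456", "NO DATA")
stringr::str_extract(tests, "^(SC|NI|OC|IP|[0-9]{2})[0-9]{6}")
#[1] "11102916" "SC404802" "OC123456" NA
```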
We then store that NA subset separately and whittle down the unmatched companies to those with a matchable company number.

```{r na subset}
compnumna <- subset(unmatched_gazette, is.na(unmatched_gazette$compnumclean))
write.csv(compnumna,"compnumna.csv")
unmatched_gazette.toapi <- subset(unmatched_gazette, !is.na(unmatched_gazette$compnumclean))
#unmatched_gazette.toapi <- subset(unmatched_gazette.toapi, !is.na(unmatched_gazette.toapi$companyname))
#check
unmatched_gazette.toapi <- unmatched_gazette.toapi %>%
  filter(!is.na(companyname))
```

Now we cycle through those and query the API for each.

Because each query is stored as a dataframe, we need to bind them as we go.

The [rate limit is 600 per 5-minute interval](https://developer.companieshouse.gov.uk/api/docs/index/gettingStarted/rateLimiting.html), so we need to stagger it too.

```{r loop through numbers & query API}
#Create an empty dataframe with the right fields
apimatches <- test[0,]
#set a counter
counter <- 0
#Loop through the cleaned company numbers - this list is compiled to
#avoid all the ones that get no response
for(i in unmatched_gazette.toapi$compnumclean){
  #print the number so we can troubleshoot any errors that interrupt it
  print(i)
  #Increment the counter and print it
  counter <- counter+1
  print(counter)
  #Every 599 queries, sleep for 5 minutes to make sure we are in a new
  #5-minute period and stay under the rate limit
  if(counter %% 599 == 0){
    Sys.sleep(300)
  }
  #Store the result of querying the API for company details
  #tryCatch is used because at least one company number returns an empty result
  tempfetch <- tryCatch(
    CompaniesHouse::CompanyDataExtract(company_number = i, mkey),
    error = function(e) NULL
  )
  #Add it to the dataframe if the query worked
  if(!is.null(tempfetch)){
    apimatches <- rbind(apimatches,tempfetch)
  }
}
```

## Join onto main database

Now we have to join the sets together:

```{r join onto rest of chg matched data}
#this will have to be for apimatchesall soon but for now, test the structure
apimatches <- apimatches %>%
  rename(compnumclean=company.number)
```
```{r}
api_test_join <- inner_join(unmatched_gazette, apimatches, by="compnumclean")

#take out CompanyNumber (the old one) and replace it with compnumclean
api_test_join <- api_test_join %>%
  select(-CompanyNumber) %>%
  rename(CompanyNumber=compnumclean)

api_prep_join <- api_test_join %>%
  select(address=address.x, companyname, CompanyNumber, noticecategory,
         noticecode, noticetype, day, month, year, typeofliq, url, CompanyName=company.name,
         RegAddress.AddressLine1=address.y, RegAddress.PostTown=locality,
         RegAddress.PostCode=postcode, IncorporationDate=Date.of.Creation, SICCode.SicText_1=sic.code)

#we need to add in the three other sic cols or this join will fail
api_prep_join$SICCode.SicText_2 <- NA
api_prep_join$SICCode.SicText_3 <- NA
api_prep_join$SICCode.SicText_4 <- NA
```

```{r}
house_gazette <- house_gazette %>%
  select(names(api_prep_join))

chg_match <- rbind(house_gazette, api_prep_join)
```

## 3. Join to postcode and LA

```{r import postcode to la lookup sheet}
postcode_to_ons_ref <- fread("NSPL_FEB_2020_UK.csv", select=c("pcds","laua"))
postcode_to_ons_ref <- postcode_to_ons_ref %>%
  rename(postcode=pcds)
```

Then join it to chg_match:

```{r postcode match - create chg_postcode}
chg_match <- chg_match %>%
  rename(postcode=RegAddress.PostCode)
#checking for whitespace errors (doesn't seem to be a factor, but kept for the record in case it's needed one day)
chg_match$postcode <- as.vector(unlist(
  lapply(chg_match$postcode, function(x) str_trim(x, side = "both"))
  ))
```
```{r}
#naming convention: ch = companies house, g = gazette
chg_postcode <- left_join(chg_match, postcode_to_ons_ref)
```
```{r check quality of match}
find_na <- chg_postcode %>%
  filter(is.na(laua))
#some postcodes are plain awkward; not sure what to do about these, but there aren't too many
```

That's about 135 addresses missing, but there's not much we can do, except perhaps patch them back in by geocoding to local authority.
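If those misses ever need rescuing, one low-effort option before resorting to geocoding is to force both sides of the join into the NSPL postcode format (upper case, single internal space). A sketch only, not applied above - `normalise_postcode` is a hypothetical helper:

```{r postcode normalise sketch}
#Sketch: normalise postcode formatting before the join
normalise_postcode <- function(x){
  x <- toupper(str_squish(x))  #trim and collapse runs of whitespace
  #re-insert the single space before the 3-character inward code
  gsub("^([A-Z0-9]+) ?([0-9][A-Z]{2})$", "\\1 \\2", x)
}

normalise_postcode(" b1  1aa ")  #returns "B1 1AA"
```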
Then join the LA names onto the codes and pivot:

```{r create by_district analysis}
#read in the la name to ons code lookup
la_name_to_ons_code <- read_csv("Local_Authority_Districts_December_2019_Names_and_Codes_in_the_United_Kingdom.csv")

#names(la_name_to_ons_code)
la_name_to_ons_code <- la_name_to_ons_code %>%
  select(LAD19CD, LAD19NM) %>%
  rename("laua"=LAD19CD, "LA_name"=LAD19NM)

#join
chg_postcode_la <- left_join(chg_postcode, la_name_to_ons_code)

#check for non-matches
find_na_LA <- chg_postcode_la %>%
  filter(is.na(LA_name))
#another one lost; the others are NAs carried over from the postcode match failures

#Pivot by area (not particularly helpful on its own - we need to normalise)
la_pivot <- chg_postcode_la %>% count(LA_name, sort=T)

#note this will overwrite the version created by the original script 'postcode_matching.R'
write_csv(la_pivot, "gazette_insolvencies_by_district.csv")
```
```{r adjust time to single column for time series comparison}
library(lubridate)

chg_postcode_la <- chg_postcode_la %>%
  mutate(date=paste(day, month, year))

chg_postcode_la <- chg_postcode_la %>%
  mutate(date = dmy(date))
```

## 4. Sic match

We want to find the proportion of businesses in each industry group and sic type.

```{r reshape chg}
#replace missing code fields with NA and drop them in the melted version.
#We're fudging it a bit with SicText_1: it is allowed to be NA, because we
#don't want to lose an insolvency record altogether if there's no sic.
chg_postcode_la <- chg_postcode_la %>%
  naniar::replace_with_na_at(.vars = c("SICCode.SicText_2","SICCode.SicText_3","SICCode.SicText_4"), condition = ~.x %in% "")
chg_postcode_la <- chg_postcode_la %>%
  dplyr::mutate(SICCode.SicText_1 = replace_na(SICCode.SicText_1, "None Supplied"))

#we're going to need the wide version later, for the time analysis, so write it out here:
write_csv(chg_postcode_la, "chg_postcode_la.csv")

chg_long <- chg_postcode_la %>%
  pivot_longer(cols=c("SICCode.SicText_1","SICCode.SicText_2","SICCode.SicText_3","SICCode.SicText_4"),
               names_to = "sicfield",
               values_to = "siccode",
               names_prefix = "SICCode.",
               values_drop_na = TRUE)

chg_long <- chg_long %>%
  naniar::replace_with_na_at(.vars = c("siccode"), condition = ~.x %in% "")

chg_long <- chg_long %>%
  tidyr::separate(col=siccode, into=c("siccode","sicdesc"), sep=" - ")
```

Now we can tidy and join onto a 'dictionary' of sic class codes and descriptions.

```{r match on sicdivs to make chg_sic}
chg_long$sicdiv <- substr(chg_long$siccode,1,2)
#import the reference sheet
sicdivs <- rio::import("sic2007_division_group_class.xls", sheet = 2)
colnames(sicdivs) <- c("sicdiv","divisionname")

chg_sic <- left_join(chg_long, sicdivs)
```

This subsection matches sic division to the more convenient set of 17 broad industry groups (a collation of sections). We can match from division to broad industry classification using this key:

```{r import industry_key and match to broad industry group}
industry_to_div <- readxl::read_excel("industry_to_div.xlsx",
                                      sheet = "industry_to_div") %>%
  select(-division_top)

industry_to_div$division_bottom <- str_pad(industry_to_div$division_bottom,2,pad="0")
industry_to_div <- industry_to_div %>%
  rename(sicdiv=division_bottom)

chg_sic <- left_join(chg_sic, industry_to_div)
```
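Before relying on that key for the faceted analysis later, it's worth checking which divisions failed to find a broad industry group - a minimal sketch, using the `industry_desc` column brought in by the join:

```{r industry join check}
#Divisions that failed to match a broad industry group after the join
chg_sic %>%
  filter(is.na(industry_desc)) %>%
  count(sicdiv, sort = TRUE)
```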
```{r clean chg_sic}
chg_sic$typeofliq <- gsub("’ ","",chg_sic$typeofliq)
unique(chg_sic$typeofliq)
```

Well, that's a mess. Let's try the textclean package.

```{r textclean trial}
#install.packages("textclean")
unique(chg_sic$typeofliq)
textclean::check_text(chg_sic$typeofliq)
```
```{r}
#that suggests replace_non_ascii; let's try it, along with its siblings:
chg_sic$typeofliq <- textclean::replace_non_ascii(chg_sic$typeofliq)
chg_sic$typeofliq <- textclean::replace_emoticon(chg_sic$typeofliq)
chg_sic$typeofliq <- textclean::replace_white(chg_sic$typeofliq)
chg_sic$typeofliq <- textclean::strip(chg_sic$typeofliq, apostrophe.remove = TRUE)
```

That helped with the ascii issues, but there are some weird replacements - 'a cent' and so on. What we want is to rationalise all entries to NA, members voluntary, creditors voluntary, or members and creditors:

```{r}
chg_sic$typeofliq_clean <- textclean::mgsub(
  chg_sic$typeofliq,
  c("type of liquidation ","liquidation","a cent","both ","all ","cvl","mvl","in"," s",
    "licensed restaurants","manufacture of lighting equipment",
    "other business support service activities not elsewhere classified",
    "ceditors","mambers",
    "the appointment was made by virtue of paragraph b of schedule b of the insolvency act",
    "the company","voluntarily"),
  c("","","","","","","","","",NA,NA,NA,"creditors","members","","","voluntary"),
  fixed=TRUE)
chg_sic$typeofliq_clean <- textclean::strip(chg_sic$typeofliq_clean)
textclean::check_text(chg_sic$typeofliq_clean)
```

```{r write out chg_sic object}
#write out the chg_sic object, as it's time-consuming to make
write_csv(chg_sic,"chg_sic_clean.csv")
write_csv(chg_sic,"chg_sic_new.csv")
```

## 5. Sic analysis - new script

```{r sic analysis by broad industry group}

```

## 6. Analysis over time - completed in separate script

```{r collapse chg_sic, eval=FALSE}
#don't run here - completed in a separate script
#note: double counting of around 4,500 entries will occur in the long version.
#we have to use the wide version, chg_postcode_la, to be able to summarise by count.

chg_postcode_la <- chg_sic %>%
  pivot_wider(names_from = sicfield,
              values_from = siccode,
              values_fn = list(siccode = list))
head(chg_postcode_la)
```
```{r taken from postcode_matching.R initial work}
pivot_mth_yr <- chg_postcode_la %>%
  group_by(year, month) %>%
  summarise(insolvencies=n()) %>%
  arrange(month)

library(ggplot2)
str(pivot_mth_yr)

pivot_mth_yr$month <- factor(pivot_mth_yr$month, levels = month.name)

grouped_bars <- ggplot(pivot_mth_yr,
                       aes(x = month,
                           y = insolvencies,
                           fill = year)) +
  geom_bar(stat="identity", position = position_dodge2(preserve = "single")) +
  geom_hline(yintercept = 0, size = 1, colour="#333333") +
  scale_fill_manual(values = c("#1380A1", "#FAAB18","#8B0000")) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
grouped_bars
```

```{r faceted by industry group}
pivot_mth_yr_sic <- chg_sic %>%
  group_by(year, month, industry_desc) %>%
  summarise(insolvencies=n()) %>%
  arrange(month)

pivot_mth_yr_sic$month <- factor(pivot_mth_yr_sic$month, levels = month.name)

grouped_bars <- ggplot(pivot_mth_yr_sic,
                       aes(x = month,
                           y = insolvencies,
                           fill = year)) +
  facet_wrap(~industry_desc, scales = "free") +
  geom_bar(stat="identity", position = position_dodge2(preserve = "single")) +
  geom_hline(yintercept = 0, size = 1, colour="#333333") +
  scale_fill_manual(values = c("#1380A1", "#FAAB18","#8B0000")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
grouped_bars
```
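Section 5 above is left to a separate script. As a placeholder, here's a minimal sketch of the kind of by-industry breakdown it might start from, assuming `chg_sic` as built here (and bearing in mind the double counting in the long version noted above):

```{r sic share sketch}
#Sketch: share of notice-SIC pairs per broad industry group.
#Long-format counts, so a company with multiple sic codes is counted
#once per code - fine for a first look, not for final figures.
sic_pivot <- chg_sic %>%
  count(industry_desc, sort = TRUE) %>%
  mutate(share = round(100 * n / sum(n), 1))
sic_pivot
```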