Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Atualiza estratégia de download dos arquivos da Receita Federal #235

Merged
merged 2 commits into from
Aug 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/gofmt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: "1.21"
go-version: "1.22"
- run: if [ "$(gofmt -s -l . | wc -l)" -gt 0 ]; then exit 1; fi
4 changes: 2 additions & 2 deletions .github/workflows/golint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: "1.21.x"
- run: "go install honnef.co/go/tools/cmd/staticcheck@2023.1.6"
go-version: "1.22.x"
- run: "go install honnef.co/go/tools/cmd/staticcheck@v0.5.1"
- run: "staticcheck ./..."
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest]
go: [1.20.x, 1.21.x]
go: [1.21.x, 1.22.x]

runs-on: ${{ matrix.os }}

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.21-bookworm AS build
FROM golang:1.22-bookworm AS build
WORKDIR /minha-receita
COPY go.mod .
COPY go.sum .
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ services:
- ./data:/mnt/data

postgres:
image: postgres:14-alpine
image: postgres:16.1-bookworm
ports:
- 5432:5432
volumes:
Expand All @@ -26,7 +26,7 @@ services:
retries: 5

postgres_test:
image: postgres:14-alpine
image: postgres:16.1-bookworm
ports:
- 5555:5432
environment: *credentials
Expand Down
2 changes: 1 addition & 1 deletion docs/instalacao.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ $ docker pull ghcr.io/cuducos/minha-receita:main

#### A partir do código fonte

* [Go](https://golang.org/) versão 1.21
* [Go](https://golang.org/) versão 1.22

Depois de clonar o repositório, baixe as dependências e compile a aplicação para um diretório incluído no `PATH`, por exemplo:

Expand Down
13 changes: 8 additions & 5 deletions download/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ import (
"time"
)

type getURLsHandler func(url, dir string) ([]string, error)
type getURLsHandler func(url string) ([]string, error)

func getURLs(url string, handler getURLsHandler, dir string, skip bool) ([]string, error) {
urls, err := handler(url, dir)
urls, err := handler(url)
if err != nil {
return nil, fmt.Errorf("error getting urls: %w", err)
}
Expand Down Expand Up @@ -72,6 +72,9 @@ func Download(dir string, timeout time.Duration, skip, restart bool, parallel in
if err := download(dir, urls, parallel, retries, chunkSize, timeout, restart); err != nil {
return fmt.Errorf("error downloading files from the federal revenue: %w", err)
}
if err := federalRevenueGetMetadata(federalRevenueMetadataURL, dir); err != nil {
return fmt.Errorf("error getting metadata: %w", err)
}
return nil
}

Expand All @@ -90,7 +93,7 @@ func DownloadFromMirror(mirror string, dir string, timeout time.Duration, skip,
// URLs shows the URLs to be downloaded.
func URLs(dir string, skip bool) error {
urls := []string{federalRevenueURL, nationalTreasureBaseURL}
handlers := []getURLsHandler{federalRevenueGetURLsNoUpdatedAt, nationalTreasureGetURLs}
handlers := []getURLsHandler{federalRevenueGetURLs, nationalTreasureGetURLs}
var out []string
for idx := range urls {
u, err := getURLs(urls[idx], handlers[idx], dir, skip)
Expand All @@ -106,7 +109,7 @@ func URLs(dir string, skip bool) error {

// UpdatedAt shows the updated at of the files to be downloaded.
func UpdatedAt() error {
u, err := fetchUpdatedAt(federalRevenueURL)
u, err := fetchUpdatedAt(federalRevenueMetadataURL)
if err != nil {
return fmt.Errorf("error getting updated at: %w", err)
}
Expand All @@ -116,7 +119,7 @@ func UpdatedAt() error {

// HasUpdate checks if there is an update available.
func HasUpdate(dir string) error {
h, err := hasUpdate(federalRevenueURL, dir)
h, err := hasUpdate(federalRevenueMetadataURL, dir)
if err != nil {
return fmt.Errorf("error getting updated at: %w", err)
}
Expand Down
19 changes: 13 additions & 6 deletions download/download_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@ import (
"net/http/httptest"
"os"
"path"
"sync/atomic"
"testing"
)

func TestGetURLs(t *testing.T) {
for _, tc := range []struct {
name string
fixture string
fixture []string
handler getURLsHandler
expected int
}{
{"federal revenue", "cadastro-nacional-de-pessoa-juridica-cnpj.json", federalRevenueGetURLs, 37},
{"national treasure", "national-treasure.json", nationalTreasureGetURLs, 1},
{"federal revenue", []string{"dados_abertos_cnpj.html", "2024-08.html"}, federalRevenueGetURLs, 37},
{"national treasure", []string{"national-treasure.json"}, nationalTreasureGetURLs, 1},
} {
ts := httpTestServer(t, tc.fixture)
defer ts.Close()
Expand Down Expand Up @@ -47,16 +48,22 @@ func loadFixture(t *testing.T, n string) (*os.File, int64) {
return f, i.Size()
}

func httpTestServer(t *testing.T, n string) *httptest.Server {
func httpTestServer(t *testing.T, cs []string) *httptest.Server {
if len(cs) == 0 {
panic("no content provided to the test server")
}
var c uint32
return httptest.NewServer(
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
idx := int(atomic.LoadUint32(&c)) % len(cs)
atomic.AddUint32(&c, 1)
if r.Method == http.MethodHead {
f, s := loadFixture(t, n)
f, s := loadFixture(t, cs[idx])
defer f.Close()
w.Header().Add("Content-Length", fmt.Sprint(s))
return
}
http.ServeFile(w, r, path.Join("..", "testdata", n))
http.ServeFile(w, r, path.Join("..", "testdata", cs[idx]))
}))
}

Expand Down
2 changes: 1 addition & 1 deletion download/downloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
)

func TestDownloader(t *testing.T) {
ts := httpTestServer(t, "cadastro-nacional-de-pessoa-juridica-cnpj.json")
ts := httpTestServer(t, []string{"cadastro-nacional-de-pessoa-juridica-cnpj.json"})
defer ts.Close()

f, s := loadFixture(t, "cadastro-nacional-de-pessoa-juridica-cnpj.json")
Expand Down
111 changes: 78 additions & 33 deletions download/federal_revenue.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,84 @@ import (
"os"
"path/filepath"
"regexp"
"slices"
"strings"
"time"
)

const (
userAgent = "Minha Receita/0.0.1 (minhareceita.org)"

// FederalRevenueUpdatedAt is a file that contains the date the data was
// extracted by the Federal Revenue
FederalRevenueUpdatedAt = "updated_at.txt"

federalRevenueURL = "https://dados.gov.br/api/publico/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj"
federalRevenueFormat = "zip+csv"
// Metadata source
federalRevenueMetadataURL = "https://dados.gov.br/api/publico/conjuntos-dados/cadastro-nacional-da-pessoa-juridica---cnpj"
federalRevenueDateFormat = "02/01/2006 15:04:05"
federalRevenueDateFormatNotes = "02/01/2006"

userAgent = "Minha Receita/0.0.1 (minhareceita.org)"
// Zipped CSV source
federalRevenueURL = "https://dadosabertos.rfb.gov.br/CNPJ/dados_abertos_cnpj"
)

var datePattern = regexp.MustCompile(`Data da última extração:? +(?P<updatedAt>\d{2}/\d{2}/\d{4})`)
var yearMonthPattern = regexp.MustCompile(`href="(\d{4}-\d{2}/)"`)
var filePattern = regexp.MustCompile(`href="(\w+\d?\.zip)"`)

// httpGet performs a GET request to url, sending the userAgent constant as
// the User-Agent header, and returns the whole response body as a string.
// Any response status other than 200 OK is reported as an error.
func httpGet(url string) (string, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("error creating request %s: %w", url, err)
	}
	req.Header.Set("User-Agent", userAgent)

	var client http.Client
	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("error getting %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("%s responded with %s", url, resp.Status)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("could not read %s response body: %w", url, err)
	}
	return string(body), nil
}

// federalRevenueGetMostRecentURL fetches the listing page at url, collects
// every link matched by yearMonthPattern (hrefs shaped like "YYYY-MM/") and
// returns url joined with the lexicographically greatest of them. Because the
// names are zero-padded year-month pairs, the greatest one is the most recent
// batch. An error is returned when the page cannot be fetched or when it
// contains no such link.
func federalRevenueGetMostRecentURL(url string) (string, error) {
	page, err := httpGet(url)
	if err != nil {
		return "", fmt.Errorf("error getting %s: %w", url, err)
	}
	matches := yearMonthPattern.FindAllStringSubmatch(page, -1)
	if len(matches) == 0 {
		return "", fmt.Errorf("no batches found in %s", url)
	}
	batches := make([]string, 0, len(matches))
	for _, m := range matches {
		// m[1] is the captured directory name, trailing slash included.
		batches = append(batches, m[1])
	}
	return url + "/" + slices.Max(batches), nil
}

// federalRevenueGetURLs returns the URLs of the zip files published in the
// most recent batch listed at url.
//
// It first resolves the most recent batch directory through
// federalRevenueGetMostRecentURL, then fetches that directory's listing and
// builds one URL per link matched by filePattern. The returned slice is nil
// when the listing contains no matching link.
func federalRevenueGetURLs(url string) ([]string, error) {
	u, err := federalRevenueGetMostRecentURL(url)
	if err != nil {
		return nil, fmt.Errorf("error getting the most recent batch url from %s: %w", url, err)
	}
	b, err := httpGet(u)
	if err != nil {
		// Report the batch URL that was actually requested, not the base one.
		return nil, fmt.Errorf("error getting %s: %w", u, err)
	}
	var urls []string
	for _, m := range filePattern.FindAllStringSubmatch(b, -1) {
		// u already ends with "/" (the captured directory name includes it),
		// so the file name is appended directly.
		urls = append(urls, u+m[1])
	}
	return urls, nil
}

type federalRevenueTime struct{ Time time.Time }

Expand All @@ -44,18 +104,18 @@ func (t *federalRevenueTime) UnmarshalJSON(b []byte) error {
return nil
}

type federalRevenueResource struct {
type federalRevenueMetadataResource struct {
Format string `json:"format"`
URL string `json:"url"`
MetadataModified federalRevenueTime `json:"metadata_modified"`
}

type federalRevenueResponse struct {
Resources []federalRevenueResource `json:"resources"`
Notes string `json:"notes"`
type federalRevenueMetadata struct {
Resources []federalRevenueMetadataResource `json:"resources"`
Notes string `json:"notes"`
}

func (r *federalRevenueResponse) updatedAt() (t time.Time) {
func (r *federalRevenueMetadata) updatedAt() (t time.Time) {
m := datePattern.FindStringSubmatch(r.Notes)
if len(m) == 2 {
t, err := time.Parse(federalRevenueDateFormatNotes, m[1])
Expand All @@ -72,7 +132,7 @@ func (r *federalRevenueResponse) updatedAt() (t time.Time) {
return t
}

func newFederalRevenueResponse(url string) (*federalRevenueResponse, error) {
func newFederalRevenueMetadata(url string) (*federalRevenueMetadata, error) {
c := http.Client{}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
Expand All @@ -91,42 +151,27 @@ func newFederalRevenueResponse(url string) (*federalRevenueResponse, error) {
if err != nil {
return nil, fmt.Errorf("could not read %s response body: %w", url, err)
}
var data federalRevenueResponse
var data federalRevenueMetadata
if err := json.Unmarshal(b, &data); err != nil {
return nil, fmt.Errorf("could not unmarshal %s json response: %w", url, err)
}
return &data, nil
}

func federalRevenueGetURLsBase(url, dir string, updatedAt bool) ([]string, error) {
data, err := newFederalRevenueResponse(url)
func federalRevenueGetMetadata(url, dir string) error {
data, err := newFederalRevenueMetadata(url)
if err != nil {
return nil, fmt.Errorf("error getting federal revenue data: %w", err)
}
var u []string
for _, v := range data.Resources {
if v.Format == federalRevenueFormat {
u = append(u, v.URL)
}
}
if updatedAt {
if err := saveUpdatedAt(dir, data.updatedAt()); err != nil {
return nil, fmt.Errorf("could not save the update at date: %w", err)
}
return fmt.Errorf("error getting federal revenue data: %w", err)
}
return u, nil
}

func federalRevenueGetURLs(url, dir string) ([]string, error) {
return federalRevenueGetURLsBase(url, dir, true)
}
if err := saveUpdatedAt(dir, data.updatedAt()); err != nil {
return fmt.Errorf("could not save the update at date: %w", err)

func federalRevenueGetURLsNoUpdatedAt(url, dir string) ([]string, error) {
return federalRevenueGetURLsBase(url, dir, false)
}
return nil
}

func fetchUpdatedAt(url string) (string, error) {
data, err := newFederalRevenueResponse(url)
data, err := newFederalRevenueMetadata(url)
if err != nil {
return "", fmt.Errorf("error getting federal revenue data: %w", err)
}
Expand Down
Loading
Loading