From ba22e43828bf593a25099361d012e312b9c89fa5 Mon Sep 17 00:00:00 2001
From: Szilard Parrag
Date: Tue, 23 Apr 2024 16:45:18 +0200
Subject: [PATCH] [pkg/stanza/fileconsumer] Fix long line parsing (#32100)

**Description:** Flush could have sent partial input before EOF was reached; this PR fixes that.

**Link to tracking Issue:** #31512, #32170

**Testing:** Added unit test `TestFlushPeriodEOF`

**Documentation:** Added a note to the `force_flush_period` option

---------

Signed-off-by: Szilard Parrag
Co-authored-by: Daniel Jaglowski
---
 .chloggen/fix_stanza_long_token.yaml | 27 +++++++++++++++++++
 .../internal/reader/reader_test.go   | 25 +++++++++++++++++
 pkg/stanza/flush/flush.go            | 14 +++++-----
 receiver/filelogreceiver/README.md   | 20 +++++++-------
 4 files changed, 69 insertions(+), 17 deletions(-)
 create mode 100644 .chloggen/fix_stanza_long_token.yaml

diff --git a/.chloggen/fix_stanza_long_token.yaml b/.chloggen/fix_stanza_long_token.yaml
new file mode 100644
index 000000000000..6f2a7c922f7f
--- /dev/null
+++ b/.chloggen/fix_stanza_long_token.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: bug_fix
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: filelogreceiver
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: When a flush timed out make sure we are at EOF (can't read more)
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [31512, 32170]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]

diff --git a/pkg/stanza/fileconsumer/internal/reader/reader_test.go b/pkg/stanza/fileconsumer/internal/reader/reader_test.go
index 8107fe235693..b26bd6a61c1f 100644
--- a/pkg/stanza/fileconsumer/internal/reader/reader_test.go
+++ b/pkg/stanza/fileconsumer/internal/reader/reader_test.go
@@ -6,8 +6,11 @@ package reader
 import (
 	"context"
 	"fmt"
+	"strings"
 	"testing"
+	"time"

+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"

 	"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/internal/filetest"
@@ -183,3 +186,25 @@ func TestFingerprintChangeSize(t *testing.T) {
 		})
 	}
 }
+
+func TestFlushPeriodEOF(t *testing.T) {
+	tempDir := t.TempDir()
+	temp := filetest.OpenTemp(t, tempDir)
+	// Create a long enough initial token, so the scanner can't read the whole file at once
+	aContentLength := 2 * 16 * 1024
+	content := []byte(strings.Repeat("a", aContentLength))
+	content = append(content, '\n', 'b')
+	_, err := temp.WriteString(string(content))
+	require.NoError(t, err)
+
+	// Make sure FlushPeriod is small, so it is guaranteed to expire
+	f, sink := testFactory(t, withFlushPeriod(5*time.Nanosecond))
+	fp, err := f.NewFingerprint(temp)
+	require.NoError(t, err)
+	r, err := f.NewReader(temp, fp)
+	require.NoError(t, err)
+	assert.Equal(t, int64(0), r.Offset)
+
+	r.ReadToEnd(context.Background())
+	sink.ExpectTokens(t, content[0:aContentLength], []byte{'b'})
+}

diff --git a/pkg/stanza/flush/flush.go b/pkg/stanza/flush/flush.go
index 0b239b4aee80..afa03fe92f4d 100644
--- a/pkg/stanza/flush/flush.go
+++ b/pkg/stanza/flush/flush.go
@@ -33,7 +33,6 @@ func (s *State) Func(splitFunc bufio.SplitFunc, period time.Duration) bufio.Spli

 	return func(data []byte, atEOF bool) (int, []byte, error) {
 		advance, token, err := splitFunc(data, atEOF)
-
 		// Don't interfere with errors
 		if err != nil {
 			return advance, token, err
@@ -52,6 +51,13 @@ func (s *State) Func(splitFunc bufio.SplitFunc, period time.Duration) bufio.Spli
 			return 0, nil, nil
 		}

+		// We're seeing new data so postpone the next flush
+		if len(data) > s.LastDataLength {
+			s.LastDataChange = time.Now()
+			s.LastDataLength = len(data)
+			return 0, nil, nil
+		}
+
 		// Flush timed out
 		if time.Since(s.LastDataChange) > period {
 			s.LastDataChange = time.Now()
@@ -59,12 +65,6 @@ func (s *State) Func(splitFunc bufio.SplitFunc, period time.Duration) bufio.Spli
 			return len(data), data, nil
 		}

-		// We're seeing new data so postpone the next flush
-		if len(data) > s.LastDataLength {
-			s.LastDataChange = time.Now()
-			s.LastDataLength = len(data)
-		}
-
 		// Ask for more data
 		return 0, nil, nil
 	}
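The heart of the fix is the new ordering of the checks in `flush.go`: the "did the data grow since the last call?" check now runs before the "did the flush period expire?" check, so a scanner that is still pulling in a long token can never flush it partially. For reviewers who want to poke at that control flow in isolation, here is a minimal, stdlib-only sketch; `withFlush` and `newlineOnly` are illustrative stand-ins written for this note, not the `flush` package's actual API.

```go
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"time"
)

// newlineOnly tokenizes on '\n' and, unlike bufio.ScanLines, never emits a
// trailing partial line on its own — mimicking multiline-style split funcs
// that rely on the flush wrapper to ever emit a final partial entry.
func newlineOnly(data []byte, atEOF bool) (int, []byte, error) {
	if i := bytes.IndexByte(data, '\n'); i >= 0 {
		return i + 1, data[:i], nil
	}
	return 0, nil, nil
}

// withFlush mirrors the patched ordering of checks in flush.(*State).Func.
func withFlush(split bufio.SplitFunc, period time.Duration) bufio.SplitFunc {
	lastDataChange := time.Now()
	lastDataLength := 0
	return func(data []byte, atEOF bool) (int, []byte, error) {
		advance, token, err := split(data, atEOF)
		if err != nil {
			return advance, token, err // don't interfere with errors
		}
		if token != nil {
			lastDataChange = time.Now() // complete token: reset the flush clock
			lastDataLength = 0
			return advance, token, nil
		}
		if atEOF && len(data) == 0 {
			return 0, nil, nil // nothing buffered and nothing more to read
		}
		// New data arrived since the last call: postpone the flush. Before the
		// fix this check ran AFTER the timeout check, so slowly growing input
		// could be flushed mid-token (issues #31512, #32170).
		if len(data) > lastDataLength {
			lastDataChange = time.Now()
			lastDataLength = len(data)
			return 0, nil, nil
		}
		// Data stopped growing and the period expired: flush the partial token.
		if time.Since(lastDataChange) > period {
			lastDataChange = time.Now()
			lastDataLength = 0
			return len(data), data, nil
		}
		return 0, nil, nil // keep asking for more data
	}
}

func main() {
	split := withFlush(newlineOnly, 10*time.Millisecond)
	data := []byte("partial")

	// First call: the buffer just grew, so the wrapper postpones the flush.
	advance, token, _ := split(data, false)
	fmt.Println(advance, token) // 0 [] — ask for more data

	// No new data arrives; once the period expires the partial token flushes.
	time.Sleep(20 * time.Millisecond)
	advance, token, _ = split(data, false)
	fmt.Printf("%d %q\n", advance, token) // 7 "partial"
}
```

This is the same behavior `TestFlushPeriodEOF` asserts at file scale: the long first token is emitted whole, and the trailing `b` only after the flush period expires with no further growth.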
diff --git a/receiver/filelogreceiver/README.md b/receiver/filelogreceiver/README.md
index e544e3164f0d..67a54629f82e 100644
--- a/receiver/filelogreceiver/README.md
+++ b/receiver/filelogreceiver/README.md
@@ -23,7 +23,7 @@ Tails and parses logs from files.
 | `exclude_older_than` | | Exclude files whose modification time is older than the specified [age](#time-parameters). |
 | `start_at` | `end` | At startup, where to start reading logs from the file. Options are `beginning` or `end`. |
 | `multiline` | | A `multiline` configuration block. See [below](#multiline-configuration) for more details. |
-| `force_flush_period` | `500ms` | [Time](#time-parameters) since last read of data from file, after which currently buffered log should be send to pipeline. A value of `0` will disable forced flushing. |
+| `force_flush_period` | `500ms` | [Time](#time-parameters) since last time new data was found in the file, after which a partial log at the end of the file may be emitted.|
 | `encoding` | `utf-8` | The encoding of the file being read. See the list of [supported encodings below](#supported-encodings) for available options. |
 | `preserve_leading_whitespaces` | `false` | Whether to preserve leading whitespaces. |
 | `preserve_trailing_whitespaces` | `false` | Whether to preserve trailing whitespaces. |
@@ -31,8 +31,8 @@ Tails and parses logs from files.
 | `include_file_path` | `false` | Whether to add the file path as the attribute `log.file.path`. |
 | `include_file_name_resolved` | `false` | Whether to add the file name after symlinks resolution as the attribute `log.file.name_resolved`. |
 | `include_file_path_resolved` | `false` | Whether to add the file path after symlinks resolution as the attribute `log.file.path_resolved`. |
-| `include_file_owner_name` | `false` | Whether to add the file owner name as the attribute `log.file.owner.name`. Not supported for windows. |
-| `include_file_owner_group_name` | `false` | Whether to add the file group name as the attribute `log.file.owner.group.name`. Not supported for windows. |
+| `include_file_owner_name` | `false` | Whether to add the file owner name as the attribute `log.file.owner.name`. Not supported for windows. |
+| `include_file_owner_group_name` | `false` | Whether to add the file group name as the attribute `log.file.owner.group.name`. Not supported for windows. |
 | `poll_interval` | 200ms | The [duration](#time-parameters) between filesystem polls. |
 | `fingerprint_size` | `1kb` | The number of bytes with which to identify a file. The first bytes in the file are used as the fingerprint. Decreasing this value at any point will cause existing fingerprints to forgotten, meaning that all files will be read from the beginning (one time) |
 | `max_log_size` | `1MiB` | The maximum size of a log entry to read. A log entry will be truncated if it is larger than `max_log_size`. Protects against reading large amounts of data into memory. |
@@ -42,20 +42,20 @@ Tails and parses logs from files.
 | `attributes` | {} | A map of `key: value` pairs to add to the entry's attributes. |
 | `resource` | {} | A map of `key: value` pairs to add to the entry's resource. |
 | `operators` | [] | An array of [operators](../../pkg/stanza/docs/operators/README.md#what-operators-are-available). See below for more details. |
-| `storage` | none | The ID of a storage extension to be used to store file offsets. File offsets allow the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver will manage offsets in memory only. |
+| `storage` | none | The ID of a storage extension to be used to store file offsets. File offsets allow the receiver to pick up where it left off in the case of a collector restart. If no storage extension is used, the receiver will manage offsets in memory only. |
 | `header` | nil | Specifies options for parsing header metadata. Requires that the `filelog.allowHeaderMetadataParsing` feature gate is enabled. See below for details. Must be `false` when `start_at` is set to `end`. |
 | `header.pattern` | required for header metadata parsing | A regex that matches every header line. |
 | `header.metadata_operators` | required for header metadata parsing | A list of operators used to parse metadata from the header. |
 | `retry_on_failure.enabled` | `false` | If `true`, the receiver will pause reading a file and attempt to resend the current batch of logs if it encounters an error from downstream components. |
 | `retry_on_failure.initial_interval` | `1s` | [Time](#time-parameters) to wait after the first failure before retrying. |
 | `retry_on_failure.max_interval` | `30s` | Upper bound on retry backoff [interval](#time-parameters). Once this value is reached the delay between consecutive retries will remain constant at the specified value. |
-| `retry_on_failure.max_elapsed_time` | `5m` | Maximum amount of [time](#time-parameters) (including retries) spent trying to send a logs batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to `0`.
-| `ordering_criteria.regex` | | Regular expression used for sorting, should contain a named capture groups that are to be used in `regex_key`. |
-| `ordering_criteria.top_n` | 1 | The number of files to track when using file ordering. The top N files are tracked after applying the ordering criteria. |
-| `ordering_criteria.sort_by.sort_type` | | Type of sorting to be performed (e.g., `numeric`, `alphabetical`, `timestamp`, `mtime`) |
+| `retry_on_failure.max_elapsed_time` | `5m` | Maximum amount of [time](#time-parameters) (including retries) spent trying to send a logs batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to `0`.
+| `ordering_criteria.regex` | | Regular expression used for sorting, should contain a named capture groups that are to be used in `regex_key`. |
+| `ordering_criteria.top_n` | 1 | The number of files to track when using file ordering. The top N files are tracked after applying the ordering criteria. |
+| `ordering_criteria.sort_by.sort_type` | | Type of sorting to be performed (e.g., `numeric`, `alphabetical`, `timestamp`, `mtime`) |
 | `ordering_criteria.sort_by.location` | | Relevant if `sort_type` is set to `timestamp`. Defines the location of the timestamp of the file. |
 | `ordering_criteria.sort_by.format` | | Relevant if `sort_type` is set to `timestamp`. Defines the strptime format of the timestamp being sorted. |
-| `ordering_criteria.sort_by.ascending` | | Sort direction |
+| `ordering_criteria.sort_by.ascending` | | Sort direction |

 Note that _by default_, no logs will be read from a file that is not actively being written to because `start_at` defaults to `end`.

@@ -193,4 +193,4 @@ Here is some of the information the `filelog` receiver stores:
   `filelog` receiver continues reading the file (`Offset`).
 - An arbitrary set of file attributes, such as the name of the file (`FileAttributes`).

-Exactly how this information is serialized depends on the type of storage being used.
\ No newline at end of file
+Exactly how this information is serialized depends on the type of storage being used.
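As a usage note for the reworded `force_flush_period` row: the snippet below sketches where the option sits in a filelog receiver config, using only options already documented in this README. The path and the `line_start_pattern` regex are placeholders for illustration, not part of this PR.

```yaml
receivers:
  filelog:
    include: [ /var/log/myapp/*.log ]  # placeholder path
    start_at: beginning
    # If no new data has been found in the file for this long, a partial
    # entry buffered by the multiline splitter below is emitted as-is.
    force_flush_period: 500ms
    multiline:
      line_start_pattern: '^\d{4}-\d{2}-\d{2}'  # placeholder: entries start with a date
```

With a `line_start_pattern`-style splitter, the last entry in a file can never be completed by a following line, so `force_flush_period` is what eventually lets it through; this fix ensures it only fires once the reader has truly stopped finding new data.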