Skip to content

Commit

Permalink
TSPROJ-7446:do not abort if dcr failure on a single node
Browse files Browse the repository at this point in the history
TSPROJ-7490:Added troubleshooting guide for common scenarios
TSPROJ-7489:Try to guess mongod log paths when they are relative
TSPROJ-7488:abort if FS free space is below 1GB
hasaketa committed Oct 17, 2024
1 parent 60b0f2b commit cb5d26a
Showing 5 changed files with 147 additions and 15 deletions.
22 changes: 22 additions & 0 deletions TROUBLESHOOTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
### Description
This document helps with common troubleshooting scenarios.

### Troubleshooting Remote Copy of Mongod Logs and FTDC Failures due to Known Hosts Error

* **Understanding the `known_hosts` file**: The `known_hosts` file is a security feature used by SSH (Secure Shell) to verify the identity of remote hosts. It stores the public keys of all known hosts, which are used to authenticate connections.
* **Step 1: Verify the `known_hosts` file**
+ Ensure that the `known_hosts` file is properly populated with the nodes you're trying to connect to.
+ Check if the file exists in the default location (`~/.ssh/known_hosts`) and contains the public keys of all expected hosts.
* **Step 2: Configure SSH to skip known hosts checking (Workaround)**
+ Edit the `~/.ssh/config` file using a text editor (e.g., `nano` or `vim`).
+ Add the following lines to the end of the file:
```bash
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no
```

* **Step 3: Test the connection:** Run `ssh` against each remote node and confirm that it connects without prompting you to verify or accept the host key.

Note: This workaround disables strict host key checking, which may compromise security. It's recommended to properly populate the `known_hosts` file or use a more secure alternative solution.

58 changes: 46 additions & 12 deletions main.go
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@ import (
"net"
"os"
"strconv"
"syscall"
"time"

"github.com/briandowns/spinner"
@@ -71,7 +72,7 @@ func main() {
err = cred.Get(&dcrlog)
if err != nil {
dcrlog.Error(err.Error())
log.Fatal("Aborting")
log.Fatal("Error why getting DB credentials aborting!")
}

remoteCred := fscopy.RemoteCred{}
@@ -108,13 +109,25 @@ func main() {
err = clustertopology.GetAllNodes()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error in Topology finding: %s", err.Error()))
log.Fatal("Error in Topology finding:", err)
log.Fatal("Error in Topology finding cannot proceed aborting:", err)
}

for _, host := range clustertopology.Allnodes.Nodes {

dcrlog.Info(fmt.Sprintf("host: %s, port: %d", host.Hostname, host.Port))

//determine if the data collection should abort due to not enough free space
//we keep approx 1GB as limit
fsHasFreeSpace, err := hasFreeSpace()
if err != nil {
dcrlog.Warn("Warning cannot check free space for data collection.")
fmt.Println("WARNING: Cannot check free space for data collection monitor free space e.g. df -h output")
} else {
if !fsHasFreeSpace {
log.Fatal("aborting because not enough free space for data collection to continue")
}
}

cred.Currentmongodhost = host.Hostname
cred.Currentmongodport = strconv.Itoa(host.Port)
cred.SetMongoURI()
@@ -132,10 +145,10 @@ func main() {
c.Outputdir = &outputdir

dcrlog.Info("Running getMongoData/mongoWellnessChecker")
err := c.RunMongoShellWithEval()
err = c.RunMongoShellWithEval()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error Running getMongoData %v", err))
log.Fatal("Error Running getMongoData ", err)
//log.Fatal("Error Running getMongoData ", err)
}

isLocalHost := false
@@ -146,11 +159,11 @@ func main() {
if errtest != nil {
dcrlog.Error(
fmt.Sprintf(
"Error determining if Hostname is a LocalHost or not : %v",
"Error determining if Hostname is a LocalHost or not. Assuming Remote node: %v",
errtest,
),
)
log.Fatal("Error determining if Hostname is a LocalHost or not :", errtest)
//log.Fatal("Error determining if Hostname is a LocalHost or not :", errtest)
}

if isLocalHost {
@@ -165,7 +178,7 @@ func main() {
err = ftdcarchive.Start()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error in FTDCArchive: %v", err))
log.Fatal("Error in FTDCArchive: ", err)
//log.Fatal("Error in FTDCArchive: ", err)
}

dcrlog.Info("Running mongo log Archiving")
@@ -175,7 +188,7 @@ func main() {
err = logarchive.Start()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error in LogArchive: %v", err))
log.Fatal("Error in LogArchive:", err)
//log.Fatal("Error in LogArchive:", err)
}

} else {
@@ -216,8 +229,8 @@ func main() {

err = remoteFTDCArchiver.Start()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error in Remote FTDC Archive: %v", err))
log.Fatal("Error in Remote FTDC Archive: ", err)
dcrlog.Error(fmt.Sprintf("Error in Remote FTDC Archive for this node: %v", err))
//log.Fatal("Error in Remote FTDC Archive: ", err)
}

remotecopyJob.Output.Reset()
@@ -234,8 +247,8 @@ func main() {

err = remoteLogArchiver.Start()
if err != nil {
dcrlog.Error(fmt.Sprintf("Error in Remote Log Archive: %v", err))
log.Fatal("Error in Remote Log Archive: ", err)
dcrlog.Error(fmt.Sprintf("Error in Remote Log Archive for this node: %v", err))
//log.Fatal("Error in Remote Log Archive: ", err)
}
}

@@ -249,6 +262,27 @@ func main() {
dcrlog.Info("---End of Script Execution----")
}

// hasFreeSpace reports whether the filesystem that holds the process's
// current working directory has at least 1.1 GiB available.
//
// It returns (false, err) when the working directory cannot be determined or
// when the filesystem statistics cannot be read; callers should treat that as
// "unknown" rather than "no space".
func hasFreeSpace() (bool, error) {
	cwd, err := os.Getwd()
	if err != nil {
		return false, err
	}

	var stat syscall.Statfs_t
	err = syscall.Statfs(cwd, &stat)
	if err != nil {
		return false, err
	}

	// Available bytes = available block count * block size.
	availBytes := stat.Bavail * uint64(stat.Bsize)
	availGB := float64(availBytes) / (1024 * 1024 * 1024)

	return availGB >= 1.1, nil
}

func getListOfHostIPsForHostname(hostname string) ([]net.IP, error) {
listOfhostIPsForHostname, err := net.LookupIP(hostname)
if err != nil {
59 changes: 59 additions & 0 deletions mongologarchiver/logpath.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright 2020 MongoDB Inc
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mongologarchiver

import (
"fmt"
"strings"
)

// LogPath turns a mongod log path that may be relative ("./...") into a
// best-guess absolute path, using the diagnostic data directory path as a
// hint for where the relative path is anchored.
type LogPath struct {
	// DiagDirPath is the diagnostic data directory path used as the hint.
	// When empty, no estimation is attempted.
	DiagDirPath string
	// CurrentLogPath is the log path as originally reported.
	CurrentLogPath string
	// PreparedLogPath is the result of ProcessLogPath: either CurrentLogPath
	// unchanged, or the estimated absolute path.
	PreparedLogPath string
}

// ProcessLogPath populates PreparedLogPath.
//
// PreparedLogPath equals CurrentLogPath unless CurrentLogPath starts with
// "./" and DiagDirPath is non-empty, in which case it is replaced by the
// estimate from logPathWithBestEstimatedParent.
func (lp *LogPath) ProcessLogPath() {
	lp.PreparedLogPath = lp.CurrentLogPath
	if lp.logPathStartsWithDotSlash() && lp.DiagDirPath != "" {
		lp.PreparedLogPath = lp.logPathWithBestEstimatedParent()
	}
}

// logPathWithBestEstimatedParent estimates an absolute path for a "./"
// relative CurrentLogPath.
//
// It walks the components of DiagDirPath from the root and stops at the first
// component equal to the first component of the relative log path; the
// components collected before that point form the estimated parent. If no
// component matches, the whole DiagDirPath is used as the parent. The full
// relative log path (not just its first component) is then appended, so
// "./logs/mongod.log" keeps its "mongod.log" file name.
func (lp *LogPath) logPathWithBestEstimatedParent() string {
	// Drop the "./" prefix; the first component may be a directory or, for
	// something like "./mongod.log", the file itself.
	relLogPath := strings.TrimPrefix(lp.CurrentLogPath, "./")
	firstSegment := strings.SplitN(relLogPath, "/", 2)[0]

	// Strip surrounding whitespace and slashes instead of blindly slicing off
	// the first and last byte: that slicing panicked on a one-character path
	// such as "/" and truncated the last component of a path that had no
	// trailing slash.
	diagDir := strings.Trim(strings.TrimSpace(lp.DiagDirPath), "/")

	var parent []string
	for _, segment := range strings.Split(diagDir, "/") {
		if segment == firstSegment {
			break
		}
		if segment != "" {
			parent = append(parent, segment)
		}
	}

	return fmt.Sprintf("/%s", strings.Join(append(parent, relLogPath), "/"))
}

// logPathStartsWithDotSlash reports whether CurrentLogPath begins with "./".
func (lp *LogPath) logPathStartsWithDotSlash() bool {
	return strings.HasPrefix(lp.CurrentLogPath, "./")
}
19 changes: 18 additions & 1 deletion mongologarchiver/mongologarchiver.go
Original file line number Diff line number Diff line change
@@ -36,6 +36,16 @@ type MongoDLogarchive struct {
Outputdir *dcroutdir.DCROutputDir
}

// getDiagnosticDataDirPath runs
// la.Mongo.RunGetCommandDiagnosticDataCollectionDirectoryPath and returns the
// contents of la.Mongo.Getparsedjsonoutput with trimQuote applied.
//
// The lookup is best effort: if the command fails, the error is printed and
// an empty string is returned instead of an error.
func (la *MongoDLogarchive) getDiagnosticDataDirPath() string {
	if err := la.Mongo.RunGetCommandDiagnosticDataCollectionDirectoryPath(); err != nil {
		// Terminate the message with a newline so it does not run into
		// whatever is printed next.
		fmt.Printf("Error in getDiagnosticDataDirPath: %v\n", err)
		return ""
	}

	return trimQuote(la.Mongo.Getparsedjsonoutput.String())
}

func (la *MongoDLogarchive) getLogPath() error {
err := la.Mongo.RunGetMongoDLogDetails()
if err != nil {
@@ -49,7 +59,14 @@ func (la *MongoDLogarchive) getLogPath() error {
}
if systemLogOutput["destination"] == "file" {
la.LogDestination = "file"
la.LogPath = trimQuote(systemLogOutput["path"].(string))

lp := LogPath{}

lp.CurrentLogPath = trimQuote(systemLogOutput["path"].(string))
lp.DiagDirPath = la.getDiagnosticDataDirPath()
lp.ProcessLogPath()
la.LogPath = lp.PreparedLogPath

la.LogDir = filepath.Dir(la.LogPath)
la.CurrentLogFileName = filepath.Base(la.LogPath)
// fmt.Println("The mongod log file path is: ", la.LogDir)
4 changes: 2 additions & 2 deletions topologyfinder/topologyfinder.go
Original file line number Diff line number Diff line change
@@ -84,7 +84,7 @@ func (tf *TopologyFinder) parseHelloOutput() error {
var hostsArray []string

if err := json.Unmarshal(tf.GetHelloOutput.Bytes(), &hostsArray); err != nil {
log.Fatal(err)
log.Fatalf("Error parsing hello output during topology discovery: %s", err)
}

for _, mongonodestring := range hostsArray {
@@ -121,7 +121,7 @@ func (tf *TopologyFinder) parseShardMapOutput() error {
var shardMap map[string]interface{}

if err := json.Unmarshal(tf.GetShardMapOutput.Bytes(), &shardMap); err != nil {
log.Fatal(err)
log.Fatalf("Error parsing shardmap output: %s", err)
}

allhosts, ok := shardMap["hosts"].(map[string]interface{})

0 comments on commit cb5d26a

Please sign in to comment.