Skip to content

Assembler: clone-all from link-index registry #1311

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Runtime.Serialization;
using YamlDotNet.Serialization;

namespace Elastic.Documentation.Configuration.Assembler;
Expand All @@ -12,6 +13,14 @@ public record NarrativeRepository : Repository
public override string Name { get; set; } = RepositoryName;
}

/// <summary>
/// Selects how a repository's working tree is checked out.
/// Each member carries an <see cref="EnumMemberAttribute"/> with its lowercase textual value.
/// </summary>
public enum CheckoutStrategy
{
/// <summary>
/// Textual value <c>partial</c>. The checkout code in this change restricts
/// sparse-checkout to the <c>docs</c> folder for this strategy.
/// </summary>
[EnumMember(Value = "partial")]
Partial,
/// <summary>
/// Textual value <c>full</c>. The checkout code in this change disables
/// sparse-checkout for this strategy.
/// </summary>
[EnumMember(Value = "full")]
Full
}

public record Repository
{
[YamlIgnore]
Expand All @@ -27,7 +36,7 @@ public record Repository
public string GitReferenceNext { get; set; } = "main";

[YamlMember(Alias = "checkout_strategy")]
public string CheckoutStrategy { get; set; } = "partial";
public CheckoutStrategy CheckoutStrategy { get; set; } = CheckoutStrategy.Partial;

[YamlMember(Alias = "skip")]
public bool Skip { get; set; }
Expand Down
11 changes: 7 additions & 4 deletions src/tooling/docs-assembler/Cli/RepositoryCommands.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
// See the LICENSE file in the project root for more information

using System.Collections.Concurrent;
using System.ComponentModel;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO.Abstractions;
using System.Net.Mime;
using Actions.Core.Services;
Expand Down Expand Up @@ -39,11 +41,13 @@ private void AssignOutputLogger()
/// <summary> Clones all repositories </summary>
/// <param name="strict"> Treat warnings as errors and fail the build on warnings</param>
/// <param name="environment"> The environment to build</param>
/// <param name="fetchLatest"> If true fetch the latest commit of the branch instead of the link registry entry ref</param>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's our use case for this? I like the flag, but I'm just wondering :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For local QAing, if you specify a different branch for a repo e.g. feature/new-content, you want to be able to fetch the latest.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general I wanted to keep the old behaviour. Just in case.

/// <param name="ctx"></param>
[Command("clone-all")]
public async Task<int> CloneAll(
bool? strict = null,
string? environment = null,
bool? fetchLatest = null,
Cancel ctx = default
)
{
Expand All @@ -55,7 +59,8 @@ public async Task<int> CloneAll(

var assembleContext = new AssembleContext(environment, collector, new FileSystem(), new FileSystem(), null, null);
var cloner = new AssemblerRepositorySourcer(logger, assembleContext);
_ = await cloner.AcquireAllLatest(ctx);

_ = await cloner.CloneAll(fetchLatest ?? false, ctx);

await collector.StopAsync(ctx);

Expand Down Expand Up @@ -138,7 +143,6 @@ public async Task<int> UpdateLinkIndexAll(ContentSource contentSource, Cancel ct
// It's only used to get the list of repositories.
var assembleContext = new AssembleContext("prod", collector, new FileSystem(), new FileSystem(), null, null);
var cloner = new RepositorySourcer(logger, assembleContext.CheckoutDirectory, new FileSystem(), collector);
var dict = new ConcurrentDictionary<string, Stopwatch>();
var repositories = new Dictionary<string, Repository>(assembleContext.Configuration.ReferenceRepositories)
{
{ NarrativeRepository.RepositoryName, assembleContext.Configuration.Narrative }
Expand All @@ -152,8 +156,7 @@ await Parallel.ForEachAsync(repositories,
{
try
{
var name = kv.Key.Trim();
var checkout = cloner.CloneOrUpdateRepository(kv.Value, name, kv.Value.GetBranch(contentSource), dict);
var checkout = cloner.CloneRef(kv.Value, kv.Value.GetBranch(contentSource), true);
var outputPath = Directory.CreateTempSubdirectory(checkout.Repository.Name).FullName;
var context = new BuildContext(
collector,
Expand Down
192 changes: 116 additions & 76 deletions src/tooling/docs-assembler/Sourcing/RepositorySourcesFetcher.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.IO.Abstractions;
using Elastic.Documentation.Configuration.Assembler;
using Elastic.Documentation.Diagnostics;
using Elastic.Documentation.LinkIndex;
using Elastic.Markdown.IO;
using Microsoft.Extensions.Logging;
using ProcNet;
Expand Down Expand Up @@ -46,129 +47,169 @@ public IReadOnlyCollection<Checkout> GetAll()
return checkouts;
}

public async Task<IReadOnlyCollection<Checkout>> AcquireAllLatest(Cancel ctx = default)
public async Task<IReadOnlyCollection<Checkout>> CloneAll(bool fetchLatest, Cancel ctx = default)
{
_logger.LogInformation(
"Cloning all repositories for environment {EnvironmentName} using '{ContentSourceStrategy}' content sourcing strategy",
_logger.LogInformation("Cloning all repositories for environment {EnvironmentName} using '{ContentSourceStrategy}' content sourcing strategy",
PublishEnvironment.Name,
PublishEnvironment.ContentSource.ToStringFast(true)
);
var checkouts = new ConcurrentBag<Checkout>();

ILinkIndexReader linkIndexReader = Aws3LinkIndexReader.CreateAnonymous();
var linkRegistry = await linkIndexReader.GetRegistry(ctx);

var repositories = new Dictionary<string, Repository>(Configuration.ReferenceRepositories)
{
{ NarrativeRepository.RepositoryName, Configuration.Narrative }
};
return await RepositorySourcer.AcquireAllLatest(repositories, PublishEnvironment.ContentSource, ctx);
}
}

public class RepositorySourcer(ILoggerFactory logger, IDirectoryInfo checkoutDirectory, IFileSystem readFileSystem, DiagnosticsCollector collector)
{
private readonly ILogger<RepositorySourcer> _logger = logger.CreateLogger<RepositorySourcer>();

public async Task<IReadOnlyCollection<Checkout>> AcquireAllLatest(Dictionary<string, Repository> repositories, ContentSource source, Cancel ctx = default)
{
var dict = new ConcurrentDictionary<string, Stopwatch>();
var checkouts = new ConcurrentBag<Checkout>();
await Parallel.ForEachAsync(repositories,
new ParallelOptions
{
CancellationToken = ctx,
MaxDegreeOfParallelism = Environment.ProcessorCount
}, async (kv, c) =>
}, async (repo, c) =>
{
await Task.Run(() =>
{
var name = kv.Key.Trim();
var repo = kv.Value;
var clone = CloneOrUpdateRepository(kv.Value, name, repo.GetBranch(source), dict);
checkouts.Add(clone);
if (!linkRegistry.Repositories.TryGetValue(repo.Key, out var entry))
{
context.Collector.EmitError("", $"'{repo.Key}' does not exist in link index");
return;
}
var branch = repo.Value.GetBranch(PublishEnvironment.ContentSource);
var gitRef = branch;
if (!fetchLatest)
{
if (!entry.TryGetValue(branch, out var entryInfo))
{
context.Collector.EmitError("", $"'{repo.Key}' does not have a '{branch}' entry in link index");
return;
}
gitRef = entryInfo.GitReference;
}
checkouts.Add(RepositorySourcer.CloneRef(repo.Value, gitRef, fetchLatest));
}, c);
}).ConfigureAwait(false);

return checkouts.ToList().AsReadOnly();
return checkouts;
}
}

public Checkout CloneOrUpdateRepository(Repository repository, string name, string branch, ConcurrentDictionary<string, Stopwatch> dict)
{
var fs = readFileSystem;
var checkoutFolder = fs.DirectoryInfo.New(Path.Combine(checkoutDirectory.FullName, name));
var relativePath = Path.GetRelativePath(Paths.WorkingDirectoryRoot.FullName, checkoutFolder.FullName);
var sw = Stopwatch.StartNew();

_ = dict.AddOrUpdate($"{name} ({branch})", sw, (_, _) => sw);
public class RepositorySourcer(ILoggerFactory logger, IDirectoryInfo checkoutDirectory, IFileSystem readFileSystem, DiagnosticsCollector collector)
{
private readonly ILogger<RepositorySourcer> _logger = logger.CreateLogger<RepositorySourcer>();

string? head;
if (checkoutFolder.Exists)
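/// <summary>
/// Clones the repository to the checkout directory and checks out the specified git reference.
/// </summary>
/// <param name="repository">The repository to clone.</param>
/// <param name="gitRef">The git reference to check out: a branch, commit, or tag.</param>
/// <param name="pull">When true, additionally runs <c>git pull</c> for <paramref name="gitRef"/> from origin after checking it out.</param>
/// <param name="attempt">The current attempt number; after 3 failed attempts an error is emitted and no further retry is made.</param>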
public Checkout CloneRef(Repository repository, string gitRef, bool pull = false, int attempt = 1)
{
var checkoutFolder = readFileSystem.DirectoryInfo.New(Path.Combine(checkoutDirectory.FullName, repository.Name));
if (attempt > 3)
{
if (!TryUpdateSource(name, branch, relativePath, checkoutFolder, out head))
head = CheckoutFromScratch(repository, name, branch, relativePath, checkoutFolder);
collector.EmitError("", $"Failed to clone repository {repository.Name}@{gitRef} after 3 attempts");
return new Checkout
{
Directory = checkoutFolder,
HeadReference = "",
Repository = repository,
};
}
else
head = CheckoutFromScratch(repository, name, branch, relativePath, checkoutFolder);

sw.Stop();

return new Checkout
_logger.LogInformation("{RepositoryName}: Cloning repository {RepositoryName}@{Commit} to {CheckoutFolder}", repository.Name, repository.Name, gitRef,
checkoutFolder.FullName);
if (!checkoutFolder.Exists)
{
Repository = repository,
Directory = checkoutFolder,
HeadReference = head
};
}

private bool TryUpdateSource(string name, string branch, string relativePath, IDirectoryInfo checkoutFolder, [NotNullWhen(true)] out string? head)
{
head = null;
try
checkoutFolder.Create();
checkoutFolder.Refresh();
}
var isGitInitialized = GitInit(repository, checkoutFolder);
string? head = null;
if (isGitInitialized)
{
_logger.LogInformation("Pull: {Name}\t{Branch}\t{RelativePath}", name, branch, relativePath);
// --allow-unrelated-histories due to shallow clones not finding a common ancestor
ExecIn(checkoutFolder, "git", "pull", "--depth", "1", "--allow-unrelated-histories", "--no-ff");
try
{
head = Capture(checkoutFolder, "git", "rev-parse", "HEAD");
}
catch (Exception e)
{
_logger.LogError(e, "{RepositoryName}: Failed to acquire current commit, falling back to recreating from scratch", repository.Name);
checkoutFolder.Delete(true);
checkoutFolder.Refresh();
return CloneRef(repository, gitRef, pull, attempt + 1);
}
}
catch (Exception e)
// Repository already checked out the same commit
if (head != null && head == gitRef)
// nothing to do, already at the right commit
_logger.LogInformation("{RepositoryName}: HEAD already at {GitRef}", repository.Name, gitRef);
else
{
_logger.LogError(e, "Failed to update {Name} from {RelativePath}, falling back to recreating from scratch", name, relativePath);
if (checkoutFolder.Exists)
FetchAndCheckout(repository, gitRef, checkoutFolder);
if (!pull)
{
return new Checkout
{
Directory = checkoutFolder,
HeadReference = gitRef,
Repository = repository,
};
}
try
{
ExecIn(checkoutFolder, "git", "pull", "--depth", "1", "--allow-unrelated-histories", "--no-ff", "origin", gitRef);
}
catch (Exception e)
{
_logger.LogError(e, "{RepositoryName}: Failed to update {GitRef} from {RelativePath}, falling back to recreating from scratch",
repository.Name, gitRef, checkoutFolder.FullName);
checkoutFolder.Delete(true);
checkoutFolder.Refresh();
return CloneRef(repository, gitRef, pull, attempt + 1);
}
return false;
}

head = Capture(checkoutFolder, "git", "rev-parse", "HEAD");
return new Checkout
{
Directory = checkoutFolder,
HeadReference = gitRef,
Repository = repository,
};
}

return true;
/// <summary>
/// Ensures <paramref name="checkoutFolder"/> contains a git repository.
/// When no <c>.git</c> directory is present, runs <c>git init</c> and registers
/// <see cref="Repository.Origin"/> as the <c>origin</c> remote.
/// </summary>
/// <param name="repository">The repository whose origin is added as the remote.</param>
/// <param name="checkoutFolder">The folder that holds (or will hold) the working tree.</param>
/// <returns>
/// <c>true</c> when a <c>.git</c> directory already existed and nothing was done;
/// <c>false</c> when the repository was initialized by this call.
/// </returns>
private bool GitInit(Repository repository, IDirectoryInfo checkoutFolder)
{
    var gitMetadataPath = Path.Combine(checkoutFolder.FullName, ".git");
    if (!Directory.Exists(gitMetadataPath))
    {
        // Fresh folder: create the repository and point it at the remote.
        ExecIn(checkoutFolder, "git", "init");
        ExecIn(checkoutFolder, "git", "remote", "add", "origin", repository.Origin);
        return false;
    }

    return true;
}

private string CheckoutFromScratch(Repository repository, string name, string branch, string relativePath, IDirectoryInfo checkoutFolder)
private void FetchAndCheckout(Repository repository, string gitRef, IDirectoryInfo checkoutFolder)
{
_logger.LogInformation("Checkout: {Name}\t{Branch}\t{RelativePath}", name, branch, relativePath);
ExecIn(checkoutFolder, "git", "fetch", "--no-tags", "--prune", "--no-recurse-submodules", "--depth", "1", "origin", gitRef);
switch (repository.CheckoutStrategy)
{
case "full":
Exec("git", "clone", repository.Origin, checkoutFolder.FullName,
"--depth", "1", "--single-branch",
"--branch", branch
);
case CheckoutStrategy.Full:
ExecIn(checkoutFolder, "git", "sparse-checkout", "disable");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary? (Not questioning it, just looking to learn.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually unlikely. But I was thinking: if you ever change the checkout strategy and then run the command while the repo is still checked out (cached), you also want sparse checkout to be disabled.

break;
case "partial":
Exec(
"git", "clone", "--filter=blob:none", "--no-checkout", repository.Origin, checkoutFolder.FullName
);

ExecIn(checkoutFolder, "git", "sparse-checkout", "set", "--cone");
Comment on lines -158 to -161
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surprised how little effect --cone mode actually has on the size on disk. This runs contrary to: https://github.blog/open-source/git/bring-your-monorepo-down-to-size-with-sparse-checkout/#sparse-checkout-and-partial-clones

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think because cone mode is the default according to https://git-scm.com/docs/git-sparse-checkout#_internalscone_mode_handling

ExecIn(checkoutFolder, "git", "checkout", branch);
case CheckoutStrategy.Partial:
ExecIn(checkoutFolder, "git", "sparse-checkout", "set", "docs");
break;
default:
throw new ArgumentOutOfRangeException(nameof(repository), repository.CheckoutStrategy, null);
}

return Capture(checkoutFolder, "git", "rev-parse", "HEAD");
ExecIn(checkoutFolder, "git", "checkout", "--force", gitRef);
}

private void Exec(string binary, params string[] args) => ExecIn(null, binary, args);

private void ExecIn(IDirectoryInfo? workingDirectory, string binary, params string[] args)
{
var arguments = new ExecArguments(binary, args)
Expand Down Expand Up @@ -221,7 +262,6 @@ string CaptureOutput()
return line;
}
}

}

public class NoopConsoleWriter : IConsoleOutWriter
Expand Down
Loading