[clangd] Performance improvements and cleanup

- Inline SymbolID hashing to header
- Don't collect references for symbols without a SymbolID
- Store referenced symbols, rather than separately storing decls and
  macros.
- Don't defer ref collection to end of translation unit
- Perform const_cast when updating reference counts (~0.5% saving)
- Introduce caching for getSymbolID in SymbolCollector. (~30% saving)
- Don't modify the SymbolSlab if there's no definition location
- Don't lex the whole file to deduce spelled tokens, just lex the
  relevant piece (~8%)

Overall this achieves ~38% reduction in time spent inside
SymbolCollector compared to baseline (on my machine :)).

I'd expect the last optimization to affect the dynamic index a lot more. I
was testing with clangd-indexer on the clangd subfolder of LLVM. As
clangd-indexer indexes a whole TU at once, we indeed see almost
every token from every source included in the TU (hence lexing full
files vs. just lexing referenced tokens costs almost the same), whereas
during dynamic indexing we mostly index main-file symbols, but we would
still touch the files defining/declaring those symbols and lex complete
files for nothing, rather than just the token at the referenced location.

The last optimization is also a functional change (test added):
previously we used raw tokens from syntax::tokenize, which didn't
canonicalize trigraphs/newlines in identifiers, whereas
Lexer::getSpelling canonicalizes them.

Differential Revision: https://reviews.llvm.org/D122894
This commit is contained in:
Kadir Cetinkaya 2022-04-08 09:56:43 +02:00
parent 5ef0ed7d5a
commit 001e88ac83
No known key found for this signature in database
GPG Key ID: E39E36B8D2057ED6
5 changed files with 134 additions and 119 deletions

View File

@ -21,11 +21,14 @@
#include "clang/AST/DeclBase.h"
#include "clang/AST/DeclObjC.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/DeclarationName.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Index/IndexSymbol.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Tooling/Syntax/Tokens.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
@ -171,6 +174,22 @@ const Decl *getRefContainer(const Decl *Enclosing,
return Enclosing;
}
// Tells whether the token located at \p Loc is written exactly like the name
// of \p ND. Only plain identifiers and constructor names can qualify; every
// other kind of declaration name is reported as not spelled.
bool isSpelled(SourceLocation Loc, const NamedDecl &ND) {
  auto DeclName = ND.getDeclName();
  const auto Kind = DeclName.getNameKind();
  const bool IsCandidateKind = Kind == DeclarationName::Identifier ||
                               Kind == DeclarationName::CXXConstructorName;
  if (!IsCandidateKind)
    return false;

  const auto &Ctx = ND.getASTContext();
  const auto &SrcMgr = Ctx.getSourceManager();
  const auto &LangOpts = Ctx.getLangOpts();

  // Lex only the single token at Loc; a true result means lexing failed.
  clang::Token RawTok;
  const bool LexFailed =
      clang::Lexer::getRawToken(Loc, RawTok, SrcMgr, LangOpts);
  if (LexFailed)
    return false;

  auto Expected = DeclName.getAsString();
  return clang::Lexer::getSpelling(RawTok, SrcMgr, LangOpts) == Expected;
}
} // namespace
// Encapsulates decisions about how to record header paths in the index,
@ -545,17 +564,17 @@ bool SymbolCollector::handleDeclOccurrence(
if (!ND)
return true;
auto ID = getSymbolIDCached(ND);
if (!ID)
return true;
// Mark D as referenced if this is a reference coming from the main file.
// D may not be an interesting symbol, but it's cheaper to check at the end.
auto &SM = ASTCtx->getSourceManager();
if (Opts.CountReferences &&
(Roles & static_cast<unsigned>(index::SymbolRole::Reference)) &&
SM.getFileID(SM.getSpellingLoc(Loc)) == SM.getMainFileID())
ReferencedDecls.insert(ND);
auto ID = getSymbolID(ND);
if (!ID)
return true;
ReferencedSymbols.insert(ID);
// ND is the canonical (i.e. first) declaration. If it's in the main file
// (which is not a header), then no public declaration was visible, so assume
@ -576,13 +595,6 @@ bool SymbolCollector::handleDeclOccurrence(
processRelations(*ND, ID, Relations);
bool CollectRef = static_cast<bool>(Opts.RefFilter & toRefKind(Roles));
bool IsOnlyRef =
!(Roles & (static_cast<unsigned>(index::SymbolRole::Declaration) |
static_cast<unsigned>(index::SymbolRole::Definition)));
if (IsOnlyRef && !CollectRef)
return true;
// Unlike other fields, e.g. Symbols (which use spelling locations), we use
// file locations for references (as it aligns the behavior of clangd's
// AST-based xref).
@ -590,13 +602,18 @@ bool SymbolCollector::handleDeclOccurrence(
if (CollectRef &&
(!IsMainFileOnly || Opts.CollectMainFileRefs ||
ND->isExternallyVisible()) &&
!isa<NamespaceDecl>(ND) &&
(Opts.RefsInHeaders ||
SM.getFileID(SM.getFileLoc(Loc)) == SM.getMainFileID()))
DeclRefs[ND].push_back(SymbolRef{SM.getFileLoc(Loc), Roles,
getRefContainer(ASTNode.Parent, Opts)});
!isa<NamespaceDecl>(ND)) {
auto FileLoc = SM.getFileLoc(Loc);
auto FID = SM.getFileID(FileLoc);
if (Opts.RefsInHeaders || FID == SM.getMainFileID()) {
addRef(ID, SymbolRef{FileLoc, FID, Roles,
getRefContainer(ASTNode.Parent, Opts),
isSpelled(FileLoc, *ND)});
}
}
// Don't continue indexing if this is a mere reference.
if (IsOnlyRef)
if (!(Roles & (static_cast<unsigned>(index::SymbolRole::Declaration) |
static_cast<unsigned>(index::SymbolRole::Definition))))
return true;
// FIXME: ObjCPropertyDecl are not properly indexed here:
@ -682,7 +699,7 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
Name->getName() == "__GCC_HAVE_DWARF2_CFI_ASM")
return true;
auto ID = getSymbolID(Name->getName(), MI, SM);
auto ID = getSymbolIDCached(Name->getName(), MI, SM);
if (!ID)
return true;
@ -693,9 +710,13 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
ASTCtx->getLangOpts());
// Do not store references to main-file macros.
if ((static_cast<unsigned>(Opts.RefFilter) & Roles) && !IsMainFileOnly &&
(Opts.RefsInHeaders || SM.getFileID(SpellingLoc) == SM.getMainFileID()))
(Opts.RefsInHeaders || SM.getFileID(SpellingLoc) == SM.getMainFileID())) {
// FIXME: Populate container information for macro references.
MacroRefs[ID].push_back({Loc, Roles, /*Container=*/nullptr});
// FIXME: All MacroRefs are marked as Spelled now, but this should be
// checked.
addRef(ID, SymbolRef{Loc, SM.getFileID(Loc), Roles, /*Container=*/nullptr,
/*Spelled=*/true});
}
// Collect symbols.
if (!Opts.CollectMacro)
@ -711,7 +732,7 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name,
if (Opts.CountReferences &&
(Roles & static_cast<unsigned>(index::SymbolRole::Reference)) &&
SM.getFileID(SpellingLoc) == SM.getMainFileID())
ReferencedMacros.insert(Name);
ReferencedSymbols.insert(ID);
// Don't continue indexing if this is a mere reference.
// FIXME: remove macro with ID if it is undefined.
@ -761,7 +782,7 @@ void SymbolCollector::processRelations(
continue;
const Decl *Object = R.RelatedSymbol;
auto ObjectID = getSymbolID(Object);
auto ObjectID = getSymbolIDCached(Object);
if (!ObjectID)
continue;
@ -792,16 +813,13 @@ void SymbolCollector::setIncludeLocation(const Symbol &S, SourceLocation Loc) {
void SymbolCollector::finish() {
// At the end of the TU, add 1 to the refcount of all referenced symbols.
auto IncRef = [this](const SymbolID &ID) {
for (const auto &ID : ReferencedSymbols) {
if (const auto *S = Symbols.find(ID)) {
Symbol Inc = *S;
++Inc.References;
Symbols.insert(Inc);
}
};
for (const NamedDecl *ND : ReferencedDecls) {
if (auto ID = getSymbolID(ND)) {
IncRef(ID);
// SymbolSlab::Builder returns const symbols because strings are interned
// and modifying returned symbols without inserting again wouldn't go
// well. const_cast is safe here as we're modifying a data owned by the
// Symbol. This reduces time spent in SymbolCollector by ~1%.
++const_cast<Symbol *>(S)->References;
}
}
if (Opts.CollectMacro) {
@ -809,16 +827,11 @@ void SymbolCollector::finish() {
// First, drop header guards. We can't identify these until EOF.
for (const IdentifierInfo *II : IndexedMacros) {
if (const auto *MI = PP->getMacroDefinition(II).getMacroInfo())
if (auto ID = getSymbolID(II->getName(), MI, PP->getSourceManager()))
if (auto ID =
getSymbolIDCached(II->getName(), MI, PP->getSourceManager()))
if (MI->isUsedForHeaderGuard())
Symbols.erase(ID);
}
// Now increment refcounts.
for (const IdentifierInfo *II : ReferencedMacros) {
if (const auto *MI = PP->getMacroDefinition(II).getMacroInfo())
if (auto ID = getSymbolID(II->getName(), MI, PP->getSourceManager()))
IncRef(ID);
}
}
// Fill in IncludeHeaders.
// We delay this until end of TU so header guards are all resolved.
@ -852,58 +865,7 @@ void SymbolCollector::finish() {
}
}
const auto &SM = ASTCtx->getSourceManager();
auto CollectRef = [&](SymbolID ID, const SymbolRef &LocAndRole,
bool Spelled = false) {
auto FileID = SM.getFileID(LocAndRole.Loc);
// FIXME: use the result to filter out references.
shouldIndexFile(FileID);
if (const auto *FE = SM.getFileEntryForID(FileID)) {
auto Range = getTokenRange(LocAndRole.Loc, SM, ASTCtx->getLangOpts());
Ref R;
R.Location.Start = Range.first;
R.Location.End = Range.second;
R.Location.FileURI = HeaderFileURIs->toURI(FE).c_str();
R.Kind = toRefKind(LocAndRole.Roles, Spelled);
R.Container = getSymbolID(LocAndRole.Container);
Refs.insert(ID, R);
}
};
// Populate Refs slab from MacroRefs.
// FIXME: All MacroRefs are marked as Spelled now, but this should be checked.
for (const auto &IDAndRefs : MacroRefs)
for (const auto &LocAndRole : IDAndRefs.second)
CollectRef(IDAndRefs.first, LocAndRole, /*Spelled=*/true);
// Populate Refs slab from DeclRefs.
llvm::DenseMap<FileID, std::vector<syntax::Token>> FilesToTokensCache;
for (auto &DeclAndRef : DeclRefs) {
if (auto ID = getSymbolID(DeclAndRef.first)) {
for (auto &LocAndRole : DeclAndRef.second) {
const auto FileID = SM.getFileID(LocAndRole.Loc);
// FIXME: It's better to use TokenBuffer by passing spelled tokens from
// the caller of SymbolCollector.
if (!FilesToTokensCache.count(FileID))
FilesToTokensCache[FileID] =
syntax::tokenize(FileID, SM, ASTCtx->getLangOpts());
llvm::ArrayRef<syntax::Token> Tokens = FilesToTokensCache[FileID];
// Check if the referenced symbol is spelled exactly the same way the
// corresponding NamedDecl is. If it is, mark this reference as spelled.
const auto *IdentifierToken =
spelledIdentifierTouching(LocAndRole.Loc, Tokens);
DeclarationName Name = DeclAndRef.first->getDeclName();
const auto NameKind = Name.getNameKind();
bool IsTargetKind = NameKind == DeclarationName::Identifier ||
NameKind == DeclarationName::CXXConstructorName;
bool Spelled = IdentifierToken && IsTargetKind &&
Name.getAsString() == IdentifierToken->text(SM);
CollectRef(ID, LocAndRole, Spelled);
}
}
}
ReferencedDecls.clear();
ReferencedMacros.clear();
DeclRefs.clear();
ReferencedSymbols.clear();
IncludeFiles.clear();
}
@ -983,16 +945,18 @@ void SymbolCollector::addDefinition(const NamedDecl &ND,
const Symbol &DeclSym) {
if (DeclSym.Definition)
return;
const auto &SM = ND.getASTContext().getSourceManager();
auto Loc = nameLocation(ND, SM);
shouldIndexFile(SM.getFileID(Loc));
auto DefLoc = getTokenLocation(Loc);
// If we saw some forward declaration, we end up copying the symbol.
// This is not ideal, but avoids duplicating the "is this a definition" check
// in clang::index. We should only see one definition.
if (!DefLoc)
return;
Symbol S = DeclSym;
const auto &SM = ND.getASTContext().getSourceManager();
auto Loc = nameLocation(ND, SM);
// FIXME: use the result to filter out symbols.
shouldIndexFile(SM.getFileID(Loc));
if (auto DefLoc = getTokenLocation(Loc))
S.Definition = *DefLoc;
S.Definition = *DefLoc;
Symbols.insert(S);
}
@ -1005,5 +969,36 @@ bool SymbolCollector::shouldIndexFile(FileID FID) {
return I.first->second;
}
// Turns \p SR into a Ref and stores it in the Refs slab under \p ID.
// Nothing is stored when SR.FID has no file entry.
void SymbolCollector::addRef(SymbolID ID, const SymbolRef &SR) {
  const auto &SrcMgr = ASTCtx->getSourceManager();
  // FIXME: use the result to filter out references.
  shouldIndexFile(SR.FID);

  const auto *Entry = SrcMgr.getFileEntryForID(SR.FID);
  if (!Entry)
    return;

  const auto TokenRange = getTokenRange(SR.Loc, SrcMgr, ASTCtx->getLangOpts());
  Ref NewRef;
  NewRef.Location.Start = TokenRange.first;
  NewRef.Location.End = TokenRange.second;
  NewRef.Location.FileURI = HeaderFileURIs->toURI(Entry).c_str();
  NewRef.Kind = toRefKind(SR.Roles, SR.Spelled);
  NewRef.Container = getSymbolIDCached(SR.Container);
  Refs.insert(ID, NewRef);
}
// Returns the SymbolID for \p D, computing it with getSymbolID() only the
// first time a given declaration is seen and serving it from DeclToIDCache
// afterwards.
SymbolID SymbolCollector::getSymbolIDCached(const Decl *D) {
  auto Lookup = DeclToIDCache.try_emplace(D, SymbolID{});
  const bool IsNewEntry = Lookup.second;
  auto &CachedID = Lookup.first->second;
  // A fresh slot still holds the placeholder ID; fill it in now.
  if (IsNewEntry)
    CachedID = getSymbolID(D);
  return CachedID;
}
// Macro flavour of the cached lookup: the cache is keyed on \p MI, and
// getSymbolID() is only invoked the first time a given MacroInfo is seen.
SymbolID SymbolCollector::getSymbolIDCached(const llvm::StringRef MacroName,
                                            const MacroInfo *MI,
                                            const SourceManager &SM) {
  auto Lookup = MacroToIDCache.try_emplace(MI, SymbolID{});
  const bool IsNewEntry = Lookup.second;
  auto &CachedID = Lookup.first->second;
  // A fresh slot still holds the placeholder ID; fill it in now.
  if (IsNewEntry)
    CachedID = getSymbolID(MacroName, MI, SM);
  return CachedID;
}
} // namespace clangd
} // namespace clang

View File

@ -8,11 +8,12 @@
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLCOLLECTOR_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLCOLLECTOR_H
#include "index/CanonicalIncludes.h"
#include "CollectMacros.h"
#include "index/CanonicalIncludes.h"
#include "index/Ref.h"
#include "index/Relation.h"
#include "index/Symbol.h"
#include "index/SymbolID.h"
#include "index/SymbolOrigin.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
@ -21,6 +22,7 @@
#include "clang/Index/IndexDataConsumer.h"
#include "clang/Index/IndexSymbol.h"
#include "clang/Sema/CodeCompleteConsumer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include <functional>
@ -142,6 +144,10 @@ private:
llvm::Optional<std::string> getIncludeHeader(const Symbol &S, FileID);
SymbolID getSymbolIDCached(const Decl *D);
SymbolID getSymbolIDCached(const llvm::StringRef MacroName,
const MacroInfo *MI, const SourceManager &SM);
// All Symbols collected from the AST.
SymbolSlab::Builder Symbols;
// File IDs for Symbol.IncludeHeaders.
@ -164,14 +170,14 @@ private:
Options Opts;
// Describes one reference occurrence; this is the payload passed to addRef().
struct SymbolRef {
// Location of the reference; addRef() derives the stored token range from it.
SourceLocation Loc;
// File that Loc belongs to, computed by the caller and passed along.
FileID FID;
// Roles of the occurrence; together with Spelled they determine the RefKind.
index::SymbolRoleSet Roles;
// Declaration recorded as the container of the reference. May be null:
// macro references currently pass nullptr.
const Decl *Container;
// Whether the reference is considered spelled in the source text.
bool Spelled;
};
void addRef(SymbolID ID, const SymbolRef &SR);
// Symbols referenced from the current TU, flushed on finish().
llvm::DenseSet<const NamedDecl *> ReferencedDecls;
llvm::DenseSet<const IdentifierInfo *> ReferencedMacros;
llvm::DenseMap<const NamedDecl *, std::vector<SymbolRef>> DeclRefs;
llvm::DenseMap<SymbolID, std::vector<SymbolRef>> MacroRefs;
llvm::DenseSet<SymbolID> ReferencedSymbols;
// Maps canonical declaration provided by clang to canonical declaration for
// an index symbol, if clangd prefers a different declaration than that
// provided by clang. For example, friend declaration might be considered
@ -184,6 +190,8 @@ private:
// to insert for which symbol, etc.
class HeaderFileURICache;
std::unique_ptr<HeaderFileURICache> HeaderFileURIs;
llvm::DenseMap<const Decl *, SymbolID> DeclToIDCache;
llvm::DenseMap<const MacroInfo *, SymbolID> MacroToIDCache;
};
} // namespace clangd

View File

@ -46,14 +46,5 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID) {
return OS << llvm::toHex(ID.raw());
}
// Hashes a SymbolID by reusing the leading bytes of its raw value.
llvm::hash_code hash_value(const SymbolID &ID) {
  // The raw ID is already a good hash, so no further mixing is done; the
  // assertion guarantees there are enough bytes to fill a size_t.
  static_assert(sizeof(size_t) <= SymbolID::RawSize,
                "size_t longer than SHA1!");
  size_t Prefix;
  memcpy(&Prefix, ID.raw().data(), sizeof(Prefix));
  return llvm::hash_code(Prefix);
}
} // namespace clangd
} // namespace clang

View File

@ -14,6 +14,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <string>
@ -36,9 +37,7 @@ public:
// Two IDs are equal when their stored hash bytes are equal.
bool operator==(const SymbolID &Sym) const {
return HashValue == Sym.HashValue;
}
bool operator!=(const SymbolID &Sym) const {
return !(*this == Sym);
}
// Defined as the negation of operator==.
bool operator!=(const SymbolID &Sym) const { return !(*this == Sym); }
// Orders IDs by comparing their stored hash bytes (std::array comparison).
bool operator<(const SymbolID &Sym) const {
return HashValue < Sym.HashValue;
}
@ -60,7 +59,14 @@ private:
std::array<uint8_t, RawSize> HashValue{};
};
llvm::hash_code hash_value(const SymbolID &ID);
// Hashes a SymbolID by reusing the leading bytes of its raw value. Defined
// inline in the header so that callers can inline it.
inline llvm::hash_code hash_value(const SymbolID &ID) {
  // The raw ID is already a good hash, so no further mixing is done; the
  // assertion guarantees there are enough bytes to fill a size_t.
  static_assert(sizeof(size_t) <= SymbolID::RawSize,
                "size_t longer than SHA1!");
  size_t Prefix;
  memcpy(&Prefix, ID.raw().data(), sizeof(Prefix));
  return llvm::hash_code(Prefix);
}
// Write SymbolID into the given stream. SymbolID is encoded as ID.str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID);

View File

@ -1014,10 +1014,21 @@ TEST_F(SymbolCollectorTest, SpelledReferences) {
)cpp",
"Foo::Foo" /// constructor.
},
{ // Unclean identifiers
R"cpp(
struct Foo {};
)cpp",
R"cpp(
$spelled[[Fo\
o]] f{};
)cpp",
"Foo",
},
};
CollectorOpts.RefFilter = RefKind::All;
CollectorOpts.RefsInHeaders = false;
for (const auto& T : TestCases) {
SCOPED_TRACE(T.Header + "\n---\n" + T.Main);
Annotations Header(T.Header);
Annotations Main(T.Main);
// Reset the file system.
@ -1040,10 +1051,14 @@ TEST_F(SymbolCollectorTest, SpelledReferences) {
}
const auto SpelledRefs = std::move(SpelledSlabBuilder).build(),
ImplicitRefs = std::move(ImplicitSlabBuilder).build();
EXPECT_THAT(SpelledRefs,
Contains(Pair(TargetID, haveRanges(SpelledRanges))));
EXPECT_THAT(ImplicitRefs,
Contains(Pair(TargetID, haveRanges(ImplicitRanges))));
EXPECT_EQ(SpelledRanges.empty(), SpelledRefs.empty());
EXPECT_EQ(ImplicitRanges.empty(), ImplicitRefs.empty());
if (!SpelledRanges.empty())
EXPECT_THAT(SpelledRefs,
Contains(Pair(TargetID, haveRanges(SpelledRanges))));
if (!ImplicitRanges.empty())
EXPECT_THAT(ImplicitRefs,
Contains(Pair(TargetID, haveRanges(ImplicitRanges))));
}
}