| //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // Record tokens that a preprocessor emits and define operations to map between |
| // the tokens written in a file and tokens produced by the preprocessor. |
| // |
| // When running the compiler, there are two token streams we are interested in: |
| // - "spelled" tokens directly correspond to a substring written in some |
| // source file. |
//   - "expanded" tokens represent the result of preprocessing; the parser
//     consumes this token stream to produce the AST.
| // |
// Expanded tokens correspond directly to locations found in the AST, which
// makes it possible to find the subranges of the token stream covered by
// various AST nodes. Spelled tokens correspond directly to the source code
// written by the user.
| // |
| // To allow composing these two use-cases, we also define operations that map |
| // between expanded and spelled tokens that produced them (macro calls, |
| // directives, etc). |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
| #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H |
| |
| #include "clang/Basic/FileManager.h" |
| #include "clang/Basic/LangOptions.h" |
| #include "clang/Basic/SourceLocation.h" |
| #include "clang/Basic/SourceManager.h" |
| #include "clang/Basic/TokenKinds.h" |
| #include "clang/Lex/Token.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include <cstdint> |
| #include <tuple> |
| |
| namespace clang { |
| class Preprocessor; |
| |
| namespace syntax { |
| |
| /// A half-open character range inside a particular file, the start offset is |
| /// included and the end offset is excluded from the range. |
| struct FileRange { |
| /// EXPECTS: File.isValid() && Begin <= End. |
| FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); |
| /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). |
| FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); |
| /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files |
| /// are the same. |
| FileRange(const SourceManager &SM, SourceLocation BeginLoc, |
| SourceLocation EndLoc); |
| |
| FileID file() const { return File; } |
| /// Start is a start offset (inclusive) in the corresponding file. |
| unsigned beginOffset() const { return Begin; } |
| /// End offset (exclusive) in the corresponding file. |
| unsigned endOffset() const { return End; } |
| |
| unsigned length() const { return End - Begin; } |
| |
| /// Check if \p Offset is inside the range. |
| bool contains(unsigned Offset) const { |
| return Begin <= Offset && Offset < End; |
| } |
| /// Check \p Offset is inside the range or equal to its endpoint. |
| bool touches(unsigned Offset) const { |
| return Begin <= Offset && Offset <= End; |
| } |
| |
| /// Gets the substring that this FileRange refers to. |
| llvm::StringRef text(const SourceManager &SM) const; |
| |
| /// Convert to the clang range. The returned range is always a char range, |
| /// never a token range. |
| CharSourceRange toCharRange(const SourceManager &SM) const; |
| |
| friend bool operator==(const FileRange &L, const FileRange &R) { |
| return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); |
| } |
| friend bool operator!=(const FileRange &L, const FileRange &R) { |
| return !(L == R); |
| } |
| |
| private: |
| FileID File; |
| unsigned Begin; |
| unsigned End; |
| }; |
| |
| /// For debugging purposes. |
| llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); |
| |
| /// A token coming directly from a file or from a macro invocation. Has just |
| /// enough information to locate the token in the source code. |
| /// Can represent both expanded and spelled tokens. |
| class Token { |
| public: |
| Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); |
| /// EXPECTS: clang::Token is not an annotation token. |
| explicit Token(const clang::Token &T); |
| |
| tok::TokenKind kind() const { return Kind; } |
| /// Location of the first character of a token. |
| SourceLocation location() const { return Location; } |
| /// Location right after the last character of a token. |
| SourceLocation endLocation() const { |
| return Location.getLocWithOffset(Length); |
| } |
| unsigned length() const { return Length; } |
| |
| /// Get the substring covered by the token. Note that will include all |
| /// digraphs, newline continuations, etc. E.g. tokens for 'int' and |
| /// in\ |
| /// t |
| /// both have the same kind tok::kw_int, but results of text() are different. |
| llvm::StringRef text(const SourceManager &SM) const; |
| |
| /// Gets a range of this token. |
| /// EXPECTS: token comes from a file, not from a macro expansion. |
| FileRange range(const SourceManager &SM) const; |
| |
| /// Given two tokens inside the same file, returns a file range that starts at |
| /// \p First and ends at \p Last. |
| /// EXPECTS: First and Last are file tokens from the same file, Last starts |
| /// after First. |
| static FileRange range(const SourceManager &SM, const syntax::Token &First, |
| const syntax::Token &Last); |
| |
| std::string dumpForTests(const SourceManager &SM) const; |
| /// For debugging purposes. |
| std::string str() const; |
| |
| private: |
| SourceLocation Location; |
| unsigned Length; |
| tok::TokenKind Kind; |
| }; |
| /// For debugging purposes. Equivalent to a call to Token::str(). |
| llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); |
| |
| /// A list of tokens obtained by preprocessing a text buffer and operations to |
| /// map between the expanded and spelled tokens, i.e. TokenBuffer has |
| /// information about two token streams: |
| /// 1. Expanded tokens: tokens produced by the preprocessor after all macro |
| /// replacements, |
| /// 2. Spelled tokens: corresponding directly to the source code of a file |
| /// before any macro replacements occurred. |
| /// Here's an example to illustrate a difference between those two: |
| /// #define FOO 10 |
| /// int a = FOO; |
| /// |
| /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. |
| /// Expanded tokens are {'int','a','=','10',';','eof'}. |
| /// |
| /// Note that the expanded token stream has a tok::eof token at the end, the |
| /// spelled tokens never store a 'eof' token. |
| /// |
| /// The full list expanded tokens can be obtained with expandedTokens(). Spelled |
| /// tokens for each of the files can be obtained via spelledTokens(FileID). |
| /// |
| /// To map between the expanded and spelled tokens use findSpelledByExpanded(). |
| /// |
| /// To build a token buffer use the TokenCollector class. You can also compute |
| /// the spelled tokens of a file using the tokenize() helper. |
| /// |
| /// FIXME: allow mappings into macro arguments. |
| class TokenBuffer { |
| public: |
| TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} |
| |
| TokenBuffer(TokenBuffer &&) = default; |
| TokenBuffer(const TokenBuffer &) = delete; |
| TokenBuffer &operator=(TokenBuffer &&) = default; |
| TokenBuffer &operator=(const TokenBuffer &) = delete; |
| |
| /// All tokens produced by the preprocessor after all macro replacements, |
| /// directives, etc. Source locations found in the clang AST will always |
| /// point to one of these tokens. |
| /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). |
| /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split |
| /// into two '>' tokens by the parser. However, TokenBuffer currently |
| /// keeps it as a single '>>' token. |
| llvm::ArrayRef<syntax::Token> expandedTokens() const { |
| return ExpandedTokens; |
| } |
| |
| /// Builds a cache to make future calls to expandedToken(SourceRange) faster. |
| /// Creates an index only once. Further calls to it will be no-op. |
| void indexExpandedTokens(); |
| |
| /// Returns the subrange of expandedTokens() corresponding to the closed |
| /// token range R. |
| /// Consider calling indexExpandedTokens() before for faster lookups. |
| llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; |
| |
| /// Returns the subrange of spelled tokens corresponding to AST node spanning |
| /// \p Expanded. This is the text that should be replaced if a refactoring |
| /// were to rewrite the node. If \p Expanded is empty, the returned value is |
| /// llvm::None. |
| /// |
| /// Will fail if the expanded tokens do not correspond to a sequence of |
| /// spelled tokens. E.g. for the following example: |
| /// |
| /// #define FIRST f1 f2 f3 |
| /// #define SECOND s1 s2 s3 |
| /// #define ID2(X, Y) X Y |
| /// |
| /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c |
| /// d ID2(e f g, h) i // expanded tokens are: d e f g h i |
| /// |
| /// the results would be: |
| /// expanded => spelled |
| /// ------------------------ |
| /// a => a |
| /// s1 s2 s3 => SECOND |
| /// a f1 f2 f3 => a FIRST |
| /// a f1 => can't map |
| /// s1 s2 => can't map |
| /// e f => e f |
| /// g h => can't map |
| /// |
| /// EXPECTS: \p Expanded is a subrange of expandedTokens(). |
| /// Complexity is logarithmic. |
| llvm::Optional<llvm::ArrayRef<syntax::Token>> |
| spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; |
| |
| /// Find the subranges of expanded tokens, corresponding to \p Spelled. |
| /// |
| /// Some spelled tokens may not be present in the expanded token stream, so |
| /// this function can return an empty vector, e.g. for tokens of macro |
| /// directives or disabled preprocessor branches. |
| /// |
| /// Some spelled tokens can be duplicated in the expanded token stream |
| /// multiple times and this function will return multiple results in those |
| /// cases. This happens when \p Spelled is inside a macro argument. |
| /// |
| /// FIXME: return correct results on macro arguments. For now, we return an |
| /// empty list. |
| /// |
| /// (!) will return empty vector on tokens from #define body: |
| /// E.g. for the following example: |
| /// |
| /// #define FIRST(A) f1 A = A f2 |
| /// #define SECOND s |
| /// |
| /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s |
| /// The results would be |
| /// spelled => expanded |
| /// ------------------------ |
| /// #define FIRST => {} |
| /// a FIRST(arg) => {a f1 arg = arg f2} |
| /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is |
| /// // after `=` in the expanded tokens. |
| llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> |
| expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
| |
| /// An expansion produced by the preprocessor, includes macro expansions and |
| /// preprocessor directives. Preprocessor always maps a non-empty range of |
| /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a |
| /// few examples of expansions: |
| /// #pragma once // Expands to an empty range. |
| /// #define FOO 1 2 3 // Expands an empty range. |
| /// FOO // Expands to "1 2 3". |
| /// FIXME(ibiryukov): implement this, currently #include expansions are empty. |
| /// #include <vector> // Expands to tokens produced by the include. |
| struct Expansion { |
| llvm::ArrayRef<syntax::Token> Spelled; |
| llvm::ArrayRef<syntax::Token> Expanded; |
| }; |
| /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting |
| /// a preprocessor directive) return the subrange of expanded tokens that the |
| /// macro expands to. |
| llvm::Optional<Expansion> |
| expansionStartingAt(const syntax::Token *Spelled) const; |
| /// Returns all expansions (partially) expanded from the specified tokens. |
| /// This is the expansions whose Spelled range intersects \p Spelled. |
| std::vector<Expansion> |
| expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const; |
| |
| /// Lexed tokens of a file before preprocessing. E.g. for the following input |
| /// #define DECL(name) int name = 10 |
| /// DECL(a); |
| /// spelledTokens() returns |
| /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", |
| /// "DECL", "(", "a", ")", ";"} |
| llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; |
| |
| /// Returns the spelled Token starting at Loc, if there are no such tokens |
| /// returns nullptr. |
| const syntax::Token *spelledTokenAt(SourceLocation Loc) const; |
| |
| /// Get all tokens that expand a macro in \p FID. For the following input |
| /// #define FOO B |
| /// #define FOO2(X) int X |
| /// FOO2(XY) |
| /// int B; |
| /// FOO; |
| /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 |
| /// respecitvely). |
| std::vector<const syntax::Token *> macroExpansions(FileID FID) const; |
| |
| const SourceManager &sourceManager() const { return *SourceMgr; } |
| |
| std::string dumpForTests() const; |
| |
| private: |
| /// Describes a mapping between a continuous subrange of spelled tokens and |
| /// expanded tokens. Represents macro expansions, preprocessor directives, |
| /// conditionally disabled pp regions, etc. |
| /// #define FOO 1+2 |
| /// #define BAR(a) a + 1 |
| /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. |
| /// BAR(1) // invocation #2, tokens = {'a', '+', '1'}, |
| /// macroTokens = {'BAR', '(', '1', ')'}. |
| struct Mapping { |
| // Positions in the corresponding spelled token stream. The corresponding |
| // range is never empty. |
| unsigned BeginSpelled = 0; |
| unsigned EndSpelled = 0; |
| // Positions in the expanded token stream. The corresponding range can be |
| // empty. |
| unsigned BeginExpanded = 0; |
| unsigned EndExpanded = 0; |
| |
| /// For debugging purposes. |
| std::string str() const; |
| }; |
| /// Spelled tokens of the file with information about the subranges. |
| struct MarkedFile { |
| /// Lexed, but not preprocessed, tokens of the file. These map directly to |
| /// text in the corresponding files and include tokens of all preprocessor |
| /// directives. |
| /// FIXME: spelled tokens don't change across FileID that map to the same |
| /// FileEntry. We could consider deduplicating them to save memory. |
| std::vector<syntax::Token> SpelledTokens; |
| /// A sorted list to convert between the spelled and expanded token streams. |
| std::vector<Mapping> Mappings; |
| /// The first expanded token produced for this FileID. |
| unsigned BeginExpanded = 0; |
| unsigned EndExpanded = 0; |
| }; |
| |
| friend class TokenCollector; |
| |
| /// Maps a single expanded token to its spelled counterpart or a mapping that |
| /// produced it. |
| std::pair<const syntax::Token *, const Mapping *> |
| spelledForExpandedToken(const syntax::Token *Expanded) const; |
| |
| /// Returns a mapping starting before \p Spelled token, or nullptr if no |
| /// such mapping exists. |
| static const Mapping * |
| mappingStartingBeforeSpelled(const MarkedFile &F, |
| const syntax::Token *Spelled); |
| |
| /// Convert a private Mapping to a public Expansion. |
| Expansion makeExpansion(const MarkedFile &, const Mapping &) const; |
| /// Returns the file that the Spelled tokens are taken from. |
| /// Asserts that they are non-empty, from a tracked file, and in-bounds. |
| const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; |
| |
| /// Token stream produced after preprocessing, conceputally this captures the |
| /// same stream as 'clang -E' (excluding the preprocessor directives like |
| /// #file, etc.). |
| std::vector<syntax::Token> ExpandedTokens; |
| // Index of ExpandedTokens for faster lookups by SourceLocation. |
| llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex; |
| llvm::DenseMap<FileID, MarkedFile> Files; |
| // The value is never null, pointer instead of reference to avoid disabling |
| // implicit assignment operator. |
| const SourceManager *SourceMgr; |
| }; |
| |
| /// The spelled tokens that overlap or touch a spelling location Loc. |
| /// This always returns 0-2 tokens. |
| llvm::ArrayRef<syntax::Token> |
| spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); |
| llvm::ArrayRef<syntax::Token> |
| spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens); |
| |
| /// The identifier token that overlaps or touches a spelling location Loc. |
| /// If there is none, returns nullptr. |
| const syntax::Token * |
| spelledIdentifierTouching(SourceLocation Loc, |
| llvm::ArrayRef<syntax::Token> Tokens); |
| const syntax::Token * |
| spelledIdentifierTouching(SourceLocation Loc, |
| const syntax::TokenBuffer &Tokens); |
| |
| /// Lex the text buffer, corresponding to \p FID, in raw mode and record the |
| /// resulting spelled tokens. Does minimal post-processing on raw identifiers, |
| /// setting the appropriate token kind (instead of the raw_identifier reported |
| /// by lexer in raw mode). This is a very low-level function, most users should |
| /// prefer to use TokenCollector. Lexing in raw mode produces wildly different |
| /// results from what one might expect when running a C++ frontend, e.g. |
| /// preprocessor does not run at all. |
| /// The result will *not* have a 'eof' token at the end. |
| std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, |
| const LangOptions &LO); |
| /// Similar to one above, instead of whole file tokenizes a part of it. Note |
| /// that, the first token might be incomplete if FR.startOffset is not at the |
| /// beginning of a token, and the last token returned will start before the |
| /// FR.endOffset but might end after it. |
| std::vector<syntax::Token> |
| tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); |
| |
| /// Collects tokens for the main file while running the frontend action. An |
| /// instance of this object should be created on |
| /// FrontendAction::BeginSourceFile() and the results should be consumed after |
| /// FrontendAction::Execute() finishes. |
| class TokenCollector { |
| public: |
| /// Adds the hooks to collect the tokens. Should be called before the |
| /// preprocessing starts, i.e. as a part of BeginSourceFile() or |
| /// CreateASTConsumer(). |
| TokenCollector(Preprocessor &P); |
| |
| /// Finalizes token collection. Should be called after preprocessing is |
| /// finished, i.e. after running Execute(). |
| LLVM_NODISCARD TokenBuffer consume() &&; |
| |
| private: |
| /// Maps from a start to an end spelling location of transformations |
| /// performed by the preprocessor. These include: |
| /// 1. range from '#' to the last token in the line for PP directives, |
| /// 2. macro name and arguments for macro expansions. |
| /// Note that we record only top-level macro expansions, intermediate |
| /// expansions (e.g. inside macro arguments) are ignored. |
| /// |
| /// Used to find correct boundaries of macro calls and directives when |
| /// building mappings from spelled to expanded tokens. |
| /// |
| /// Logically, at each point of the preprocessor execution there is a stack of |
| /// macro expansions being processed and we could use it to recover the |
| /// location information we need. However, the public preprocessor API only |
| /// exposes the points when macro expansions start (when we push a macro onto |
| /// the stack) and not when they end (when we pop a macro from the stack). |
| /// To workaround this limitation, we rely on source location information |
| /// stored in this map. |
| using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>; |
| class Builder; |
| class CollectPPExpansions; |
| |
| std::vector<syntax::Token> Expanded; |
| // FIXME: we only store macro expansions, also add directives(#pragma, etc.) |
| PPExpansions Expansions; |
| Preprocessor &PP; |
| CollectPPExpansions *Collector; |
| }; |
| |
| } // namespace syntax |
| } // namespace clang |
| |
| #endif |