/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/include/llvm/Support/TrigramIndex.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- TrigramIndex.h - a heuristic for SpecialCaseList --------*- C++ -*-===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // TrigramIndex implements a heuristic for SpecialCaseList that allows to |
10 | | // filter out ~99% incoming queries when all regular expressions in the |
11 | | // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more |
12 | | // complicated, the check is defeated and it will always pass the queries to a |
13 | | // full regex. |
14 | | // |
15 | | // The basic idea is that in order for a wildcard to match a query, the query |
16 | | // needs to have all trigrams which occur in the wildcard. We create a trigram |
17 | | // index (trigram -> list of rules with it) and then count trigrams in the query |
18 | | // for each rule. If the count for one of the rules reaches the expected value, |
19 | | // the check passes the query to a regex. If none of the rules got enough |
20 | | // trigrams, the check tells that the query is definitely not matched by any |
21 | | // of the rules, and no regex matching is needed. |
22 | | // A similar idea was used in Google Code Search as described in the blog post: |
23 | | // https://swtch.com/~rsc/regexp/regexp4.html |
24 | | // |
25 | | //===----------------------------------------------------------------------===// |
26 | | |
27 | | #ifndef LLVM_SUPPORT_TRIGRAMINDEX_H |
28 | | #define LLVM_SUPPORT_TRIGRAMINDEX_H |
29 | | |
30 | | #include "llvm/ADT/SmallVector.h" |
31 | | #include "llvm/ADT/StringMap.h" |
32 | | |
33 | | #include <string> |
34 | | #include <unordered_map> |
35 | | #include <vector> |
36 | | |
37 | | namespace llvm { |
38 | | class StringRef; |
39 | | |
40 | | class TrigramIndex { |
41 | | public: |
42 | | /// Inserts a new Regex into the index. |
43 | | void insert(std::string Regex); |
44 | | |
45 | | /// Returns true, if special case list definitely does not have a line |
46 | | /// that matches the query. Returns false, if it's not sure. |
47 | | bool isDefinitelyOut(StringRef Query) const; |
48 | | |
49 | | /// Returned true, iff the heuristic is defeated and not useful. |
50 | | /// In this case isDefinitelyOut always returns false. |
51 | 0 | bool isDefeated() { return Defeated; } |
52 | | private: |
53 | | // If true, the rules are too complicated for the check to work, and full |
54 | | // regex matching is needed for every rule. |
55 | | bool Defeated = false; |
56 | | // The minimum number of trigrams which should match for a rule to have a |
57 | | // chance to match the query. The number of elements equals the number of |
58 | | // regex rules in the SpecialCaseList. |
59 | | std::vector<unsigned> Counts; |
60 | | // Index holds a list of rules indices for each trigram. The same indices |
61 | | // are used in Counts to store per-rule limits. |
62 | | // If a trigram is too common (>4 rules with it), we stop tracking it, |
63 | | // which increases the probability for a need to match using regex, but |
64 | | // decreases the costs in the regular case. |
65 | | std::unordered_map<unsigned, SmallVector<size_t, 4>> Index{256}; |
66 | | }; |
67 | | |
68 | | } // namespace llvm |
69 | | |
70 | | #endif // LLVM_SUPPORT_TRIGRAMINDEX_H |