diff --git a/Analyser/Analyser.csproj b/Analyser/Analyser.csproj index 0eadbfc..2224687 100644 --- a/Analyser/Analyser.csproj +++ b/Analyser/Analyser.csproj @@ -15,6 +15,20 @@ true + + + + + + + + Never + + + Never + + + diff --git a/Analyser/ConfigManager.cs b/Analyser/ConfigManager.cs index 8539534..3cee865 100644 --- a/Analyser/ConfigManager.cs +++ b/Analyser/ConfigManager.cs @@ -1,27 +1,12 @@ using System.IO; -using System; namespace JiebaNet.Analyser { public class ConfigManager { // TODO: duplicate codes. - public static string ConfigFileBaseDir - { - get - { - return "Resources"; - } - } + public static string ConfigFileBaseDir => "Resources"; - public static string IdfFile - { - get { return Path.Combine(ConfigFileBaseDir, "idf.txt"); } - } - - public static string StopWordsFile - { - get { return Path.Combine(ConfigFileBaseDir, "stopwords.txt"); } - } + public static string IdfFile => Path.Combine(ConfigFileBaseDir, "idf.txt"); } } \ No newline at end of file diff --git a/Analyser/IdfLoader.cs b/Analyser/IdfLoader.cs index 6cdce2a..1795ec3 100644 --- a/Analyser/IdfLoader.cs +++ b/Analyser/IdfLoader.cs @@ -1,47 +1,25 @@ using JiebaNet.Segmenter.Common; using System.Collections.Generic; -using System.IO; using System.Linq; -using System.Reflection; using System.Text; namespace JiebaNet.Analyser { public class IdfLoader { - internal string IdfFilePath { get; set; } internal IDictionary IdfFreq { get; set; } internal double MedianIdf { get; set; } - public IdfLoader(string idfPath = null) + public IdfLoader(IDictionary idfFreq) { - IdfFilePath = string.Empty; - IdfFreq = new Dictionary(); - MedianIdf = 0.0; - if (!string.IsNullOrWhiteSpace(idfPath)) - { - SetNewPath(idfPath); - } + IdfFreq = idfFreq ?? new Dictionary(); + MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2]; } - public void SetNewPath(string newIdfPath) + public void SetNewPath(IDictionary newIdfFreq) { - var idfPath = newIdfPath; - if (IdfFilePath != idfPath) - { - IdfFilePath = idfPath; - var lines = FileExtension.ReadEmbeddedAllLines(idfPath, Encoding.UTF8); - IdfFreq = new Dictionary(); - foreach (var line in lines) - { - var parts = line.Trim().Split(' '); - var word = parts[0]; - var freq = double.Parse(parts[1]); - IdfFreq[word] = freq; - } - - MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2]; - } + IdfFreq = IdfFreq?.Union(newIdfFreq).ToDictionary(k => k.Key, v => v.Value) ?? newIdfFreq; + MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2]; } } } \ No newline at end of file diff --git a/Analyser/KeywordExtractor.cs b/Analyser/KeywordExtractor.cs index f428043..db1b92b 100644 --- a/Analyser/KeywordExtractor.cs +++ b/Analyser/KeywordExtractor.cs @@ -1,7 +1,4 @@ -using JiebaNet.Segmenter.Common; -using Microsoft.Extensions.FileProviders; -using System.Collections.Generic; -using System.IO; +using System.Collections.Generic; namespace JiebaNet.Analyser { @@ -14,35 +11,28 @@ public abstract class KeywordExtractor "this", "then", "at", "have", "all", "not", "one", "has", "or", "that" }; - protected virtual ISet StopWords { get; set; } - - public void SetStopWords(string stopWordsFile) + protected ISet StopWords { get; set; } + public void SetStopWords(ISet stopWords) { - StopWords = new HashSet(); - var lines = FileExtension.ReadEmbeddedAllLines(stopWordsFile); - foreach (var line in lines) - { - StopWords.Add(line.Trim()); - } + StopWords = stopWords ?? new HashSet(); } public void AddStopWord(string word) { if (!StopWords.Contains(word)) - { StopWords.Add(word.Trim()); - } } public void AddStopWords(IEnumerable words) { foreach (var word in words) - { AddStopWord(word); - } } - public abstract IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null); - public abstract IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null); + public abstract IEnumerable ExtractTags(string text, int count = 20, + IEnumerable allowPos = null); + + public abstract IEnumerable ExtractTagsWithWeight(string text, int count = 20, + IEnumerable allowPos = null); } } \ No newline at end of file diff --git a/Analyser/TextRankExtractor.cs b/Analyser/TextRankExtractor.cs index 4d96e57..d65eb35 100644 --- a/Analyser/TextRankExtractor.cs +++ b/Analyser/TextRankExtractor.cs @@ -25,31 +25,39 @@ public bool PairFilter(IEnumerable allowPos, Pair wp) && !StopWords.Contains(wp.Word.ToLower()); } - public TextRankExtractor() + public TextRankExtractor(ISet stopWords) { Span = 5; Segmenter = new JiebaSegmenter(); PosSegmenter = new PosSegmenter(Segmenter); - SetStopWords(ConfigManager.StopWordsFile); + SetStopWords(stopWords); if (StopWords.IsEmpty()) - { StopWords.UnionWith(DefaultStopWords); - } } - public override IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null) + public override IEnumerable ExtractTags(string text, int count = 20, + IEnumerable allowPos = null) { var rank = ExtractTagRank(text, allowPos); - if (count <= 0) { count = 20; } + if (count <= 0) + { + count = 20; + } + return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count); } - public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null) + public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, + IEnumerable allowPos = null) { var rank = ExtractTagRank(text, allowPos); - if (count <= 0) { count = 20; } - return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair() + if (count <= 0) + { + count = 20; + } + + return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair { Word = p.Key, Weight = p.Value }).Take(count); @@ -60,9 +68,7 @@ public override IEnumerable ExtractTagsWithWeight(string text, i private IDictionary ExtractTagRank(string text, IEnumerable allowPos) { if (allowPos.IsEmpty()) - { allowPos = DefaultPosFilter; - } var g = new UndirectWeightedGraph(); var cm = new Dictionary(); @@ -71,27 +77,19 @@ private IDictionary ExtractTagRank(string text, IEnumerable= words.Count) - { - break; - } - if (!PairFilter(allowPos, words[j])) - { - continue; - } - - // TODO: better separator. - var key = wp.Word + "$" + words[j].Word; - if (!cm.ContainsKey(key)) - { - cm[key] = 0; - } - cm[key] += 1; - } + if (j >= words.Count) + break; + if (!PairFilter(allowPos, words[j])) + continue; + + // TODO: better separator. + var key = wp.Word + "$" + words[j].Word; + if (!cm.ContainsKey(key)) + cm[key] = 0; + cm[key] += 1; } } diff --git a/Analyser/TfidfExtractor.cs b/Analyser/TfidfExtractor.cs index fe33e38..4b58e42 100644 --- a/Analyser/TfidfExtractor.cs +++ b/Analyser/TfidfExtractor.cs @@ -1,5 +1,4 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; using JiebaNet.Segmenter; using JiebaNet.Segmenter.Common; @@ -10,7 +9,7 @@ namespace JiebaNet.Analyser public class TfidfExtractor : KeywordExtractor { private static readonly string DefaultIdfFile = ConfigManager.IdfFile; - private static readonly int DefaultWordCount = 20; + private const int DefaultWordCount = 20; private JiebaSegmenter Segmenter { get; set; } private PosSegmenter PosSegmenter { get; set; } @@ -19,32 +18,24 @@ public class TfidfExtractor : KeywordExtractor private IDictionary IdfFreq { get; set; } private double MedianIdf { get; set; } - public TfidfExtractor(JiebaSegmenter segmenter = null) + public TfidfExtractor(ISet stopWords, IDictionary idfFreq, + JiebaSegmenter segmenter = null) { - if (segmenter.IsNull()) - { - Segmenter = new JiebaSegmenter(); - } - else - { - Segmenter = segmenter; - } + Segmenter = segmenter.IsNull() ? new JiebaSegmenter() : segmenter; PosSegmenter = new PosSegmenter(Segmenter); - SetStopWords(ConfigManager.StopWordsFile); + SetStopWords(stopWords); if (StopWords.IsEmpty()) - { StopWords.UnionWith(DefaultStopWords); - } - Loader = new IdfLoader(DefaultIdfFile); + Loader = new IdfLoader(idfFreq); IdfFreq = Loader.IdfFreq; MedianIdf = Loader.MedianIdf; } - public void SetIdfPath(string idfPath) + public void SetIdfPath(IDictionary idfFreq) { - Loader.SetNewPath(idfPath); + Loader.SetNewPath(idfFreq); IdfFreq = Loader.IdfFreq; MedianIdf = Loader.MedianIdf; } @@ -58,14 +49,7 @@ private IEnumerable FilterCutByPos(string text, IEnumerable allo private IDictionary GetWordIfidf(string text, IEnumerable allowPos) { IEnumerable words = null; - if (allowPos.IsNotEmpty()) - { - words = FilterCutByPos(text, allowPos); - } - else - { - words = Segmenter.Cut(text); - } + words = allowPos.IsNotEmpty() ? FilterCutByPos(text, allowPos) : Segmenter.Cut(text); // Calculate TF var freq = new Dictionary(); @@ -73,31 +57,32 @@ private IDictionary GetWordIfidf(string text, IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null) + public override IEnumerable ExtractTags(string text, int count = 20, + IEnumerable allowPos = null) { - if (count <= 0) { count = DefaultWordCount; } + if (count <= 0) + count = DefaultWordCount; var freq = GetWordIfidf(text, allowPos); return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count); } - public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null) + public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, + IEnumerable allowPos = null) { - if (count <= 0) { count = DefaultWordCount; } + if (count <= 0) + count = DefaultWordCount; var freq = GetWordIfidf(text, allowPos); return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair() @@ -112,4 +97,4 @@ public class WordWeightPair public string Word { get; set; } public double Weight { get; set; } } -} +} \ No newline at end of file diff --git a/Analyser/UndirectWeightedGraph.cs b/Analyser/UndirectWeightedGraph.cs index b1afd0c..fe622cc 100644 --- a/Analyser/UndirectWeightedGraph.cs +++ b/Analyser/UndirectWeightedGraph.cs @@ -1,4 +1,3 @@ -using System; using System.Collections.Generic; using System.Linq; @@ -13,9 +12,10 @@ public class Edge public class UndirectWeightedGraph { - private static readonly double d = 0.85; + private const double D = 0.85; + + public IDictionary> Graph { get; set; } - public IDictionary> Graph { get; set; } public UndirectWeightedGraph() { Graph = new Dictionary>(); @@ -24,17 +24,13 @@ public UndirectWeightedGraph() public void AddEdge(string start, string end, double weight) { if (!Graph.ContainsKey(start)) - { Graph[start] = new List(); - } if (!Graph.ContainsKey(end)) - { Graph[end] = new List(); - } - Graph[start].Add(new Edge(){ Start = start, End = end, Weight = weight }); - Graph[end].Add(new Edge(){ Start = end, End = start, Weight = weight }); + Graph[start].Add(new Edge() {Start = start, End = end, Weight = weight}); + Graph[end].Add(new Edge() {Start = end, End = start, Weight = weight}); } public IDictionary Rank() @@ -44,7 +40,7 @@ public IDictionary Rank() // init scores var count = Graph.Count > 0 ? Graph.Count : 1; - var wsdef = 1.0/count; + var wsdef = 1.0 / count; foreach (var pair in Graph) { @@ -60,10 +56,8 @@ public IDictionary Rank() { var s = 0d; foreach (var edge in Graph[n]) - { - s += edge.Weight/outSum[edge.End]*ws[edge.End]; - } - ws[n] = (1 - d) + d*s; + s += edge.Weight / outSum[edge.End] * ws[edge.End]; + ws[n] = (1 - D) + D * s; } } @@ -73,19 +67,13 @@ public IDictionary Rank() foreach (var w in ws.Values) { if (w < minRank) - { minRank = w; - } - if(w > maxRank) - { + if (w > maxRank) maxRank = w; - } } foreach (var pair in ws.ToList()) - { - ws[pair.Key] = (pair.Value - minRank/10.0)/(maxRank - minRank/10.0); - } + ws[pair.Key] = (pair.Value - minRank / 10.0) / (maxRank - minRank / 10.0); return ws; } diff --git a/Segmenter/Common/Counter.cs b/Segmenter/Common/Counter.cs index 692613b..1feb523 100644 --- a/Segmenter/Common/Counter.cs +++ b/Segmenter/Common/Counter.cs @@ -1,5 +1,4 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; namespace JiebaNet.Segmenter.Common @@ -53,30 +52,32 @@ public interface ICounter bool Contains(T key); } - public class Counter: ICounter + public class Counter : ICounter { - private Dictionary data = new Dictionary(); + private readonly Dictionary _data = new Dictionary(); - public Counter() {} + private Counter() + { + } public Counter(IEnumerable items) { CountItems(items); } - public int Count => data.Count; - public int Total => data.Values.Sum(); - public IEnumerable> Elements => data; + public int Count => _data.Count; + public int Total => _data.Values.Sum(); + public IEnumerable> Elements => _data; public int this[T key] { - get => data.ContainsKey(key) ? data[key] : 0; - set => data[key] = value; + get => _data.ContainsKey(key) ? _data[key] : 0; + set => _data[key] = value; } public IEnumerable> MostCommon(int n = -1) { - var pairs = data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value); + var pairs = _data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value); return n < 0 ? pairs : pairs.Take(n); } @@ -103,7 +104,7 @@ public void Add(ICounter other) public ICounter Union(ICounter other) { var result = new Counter(); - foreach (var pair in data) + foreach (var pair in _data) { var count = pair.Value; var otherCount = other[pair.Key]; @@ -112,31 +113,26 @@ public ICounter Union(ICounter other) } foreach (var pair in other.Elements) - { if (!Contains(pair.Key)) - { result[pair.Key] = pair.Value; - } - } + return result; } public void Remove(T key) { - if (data.ContainsKey(key)) - { - data.Remove(key); - } + if (_data.ContainsKey(key)) + _data.Remove(key); } public void Clear() { - data.Clear(); + _data.Clear(); } public bool Contains(T key) { - return data.ContainsKey(key); + return _data.ContainsKey(key); } #region Private Methods @@ -144,35 +140,27 @@ public bool Contains(T key) private void CountItems(IEnumerable items) { foreach (var item in items) - { - data[item] = data.GetDefault(item, 0) + 1; - } + _data[item] = _data.GetDefault(item, 0) + 1; } private void CountPairs(IEnumerable> pairs) { foreach (var pair in pairs) - { this[pair.Key] += pair.Value; - } } private void SubtractItems(IEnumerable items) { foreach (var item in items) - { - data[item] = data.GetDefault(item, 0) - 1; - } + _data[item] = _data.GetDefault(item, 0) - 1; } private void SubtractPairs(IEnumerable> pairs) { foreach (var pair in pairs) - { this[pair.Key] -= pair.Value; - } } #endregion } -} +} \ No newline at end of file diff --git a/Segmenter/Common/Extensions.cs b/Segmenter/Common/Extensions.cs index 5da7888..7ca6685 100644 --- a/Segmenter/Common/Extensions.cs +++ b/Segmenter/Common/Extensions.cs @@ -1,5 +1,4 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; @@ -24,7 +23,6 @@ public static bool IsNotNull(this object obj) #endregion - #region Enumerable public static bool IsEmpty(this IEnumerable enumerable) @@ -42,21 +40,16 @@ public static TValue GetValueOrDefault(this IDictionary(this IDictionary dict, TKey key, TValue defaultValue) + public static TValue GetDefault(this IDictionary dict, TKey key, + TValue defaultValue) { - if (dict.ContainsKey(key)) - { - return dict[key]; - } - return defaultValue; + return dict.ContainsKey(key) ? dict[key] : defaultValue; } public static void Update(this IDictionary dict, IDictionary other) { foreach (var key in other.Keys) - { dict[key] = other[key]; - } } #endregion @@ -65,23 +58,12 @@ public static void Update(this IDictionary dict, IDi public static string Left(this string s, int endIndex) { - if (string.IsNullOrEmpty(s)) - { - return s; - } - - return s.Substring(0, endIndex); + return string.IsNullOrEmpty(s) ? s : s.Substring(0, endIndex); } public static string Right(this string s, int startIndex) { - if (string.IsNullOrEmpty(s)) - { - return s; - } - - - return s.Substring(startIndex); + return string.IsNullOrEmpty(s) ? s : s.Substring(startIndex); } public static string Sub(this string s, int startIndex, int endIndex) @@ -93,7 +75,7 @@ public static bool IsInt32(this string s) { return RegexDigits.IsMatch(s); } - + public static string[] SplitLines(this string s) { return RegexNewline.Split(s); @@ -107,7 +89,7 @@ public static string Join(this IEnumerable inputs, string separator = ", public static IEnumerable SubGroupValues(this GroupCollection groups) { var result = from Group g in groups - select g.Value; + select g.Value; return result.Skip(1); } @@ -122,7 +104,7 @@ public static int ToInt32(this char ch) public static char ToChar(this int i) { - return (char)i; + return (char) i; } #endregion diff --git a/Segmenter/Common/FileExtension.cs b/Segmenter/Common/FileExtension.cs index 1dd0157..f70e244 100644 --- a/Segmenter/Common/FileExtension.cs +++ b/Segmenter/Common/FileExtension.cs @@ -1,5 +1,4 @@ using Microsoft.Extensions.FileProviders; -using System; using System.Collections.Generic; using System.IO; using System.Reflection; @@ -14,29 +13,26 @@ public static string ReadEmbeddedAllLine(string path) return ReadEmbeddedAllLine(path, Encoding.UTF8); } - public static string ReadEmbeddedAllLine(string path,Encoding encoding) + public static string ReadEmbeddedAllLine(string path, Encoding encoding) { var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly); var fileInfo = provider.GetFileInfo(path); using (var sr = new StreamReader(fileInfo.CreateReadStream(), encoding)) - { return sr.ReadToEnd(); - } } public static List ReadEmbeddedAllLines(string path, Encoding encoding) { var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly); var fileInfo = provider.GetFileInfo(path); - List list = new List(); - using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding)) + var list = new List(); + using (var streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding)) { string item; while ((item = streamReader.ReadLine()) != null) - { list.Add(item); - } } + return list; } @@ -45,4 +41,4 @@ public static List ReadEmbeddedAllLines(string path) return ReadEmbeddedAllLines(path, Encoding.UTF8); } } -} +} \ No newline at end of file diff --git a/Segmenter/Common/Trie.cs b/Segmenter/Common/Trie.cs index 3e8c2a5..06b7492 100644 --- a/Segmenter/Common/Trie.cs +++ b/Segmenter/Common/Trie.cs @@ -15,7 +15,7 @@ public TrieNode(char ch) { Char = ch; Frequency = 0; - + // TODO: or an empty dict? //Children = null; } @@ -23,48 +23,33 @@ public TrieNode(char ch) public int Insert(string s, int pos, int freq = 1) { if (string.IsNullOrEmpty(s) || pos >= s.Length) - { return 0; - } if (Children == null) - { Children = new Dictionary(); - } var c = s[pos]; if (!Children.ContainsKey(c)) - { Children[c] = new TrieNode(c); - } var curNode = Children[c]; - if (pos == s.Length - 1) - { - curNode.Frequency += freq; - return curNode.Frequency; - } + if (pos != s.Length - 1) return curNode.Insert(s, pos + 1, freq); + curNode.Frequency += freq; + return curNode.Frequency; - return curNode.Insert(s, pos + 1, freq); } public TrieNode Search(string s, int pos) { if (string.IsNullOrEmpty(s)) - { return null; - } // if out of range or without any child nodes if (pos >= s.Length || Children == null) - { return null; - } // if reaches the last char of s, it's time to make the decision. if (pos == s.Length - 1) - { return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null; - } // continue if necessary. return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null; } @@ -75,7 +60,9 @@ public interface ITrie //string BestMatch(string word, long maxTime); bool Contains(string word); int Frequency(string word); + int Insert(string word, int freq = 1); + //bool Remove(string word); int Count { get; } int TotalFrequency { get; } @@ -83,9 +70,9 @@ public interface ITrie public class Trie : ITrie { - private static readonly char RootChar = '\0'; + private const char RootChar = '\0'; - internal TrieNode Root; + internal readonly TrieNode Root; public int Count { get; private set; } public int TotalFrequency { get; private set; } @@ -125,11 +112,9 @@ public int Insert(string word, int freq = 1) CheckWord(word); var i = Root.Insert(word.Trim(), 0, freq); - if (i > 0) - { - TotalFrequency += freq; - Count++; - } + if (i <= 0) return i; + TotalFrequency += freq; + Count++; return i; } @@ -143,9 +128,7 @@ public IEnumerable ChildChars(string prefix) private void CheckWord(string word) { if (string.IsNullOrWhiteSpace(word)) - { throw new ArgumentException("word must not be null or whitespace"); - } } } -} +} \ No newline at end of file diff --git a/Segmenter/ConfigManager.cs b/Segmenter/ConfigManager.cs index af2f9a0..803e09e 100644 --- a/Segmenter/ConfigManager.cs +++ b/Segmenter/ConfigManager.cs @@ -1,5 +1,4 @@ -using System; -using System.IO; +using System.IO; namespace JiebaNet.Segmenter { @@ -9,44 +8,23 @@ public static string ConfigFileBaseDir { get { - var configFileDir = "Resources"; + const string configFileDir = "Resources"; return configFileDir; } } - public static string MainDictFile - { - get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); } - } + public static string MainDictFile => Path.Combine(ConfigFileBaseDir, "dict.txt"); - public static string ProbTransFile - { - get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); } - } + public static string ProbTransFile => Path.Combine(ConfigFileBaseDir, "prob_trans.json"); - public static string ProbEmitFile - { - get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); } - } + public static string ProbEmitFile => Path.Combine(ConfigFileBaseDir, "prob_emit.json"); - public static string PosProbStartFile - { - get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); } - } + public static string PosProbStartFile => Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); - public static string PosProbTransFile - { - get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); } - } + public static string PosProbTransFile => Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); - public static string PosProbEmitFile - { - get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); } - } + public static string PosProbEmitFile => Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); - public static string CharStateTabFile - { - get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); } - } + public static string CharStateTabFile => Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); } } \ No newline at end of file diff --git a/Segmenter/Constants.cs b/Segmenter/Constants.cs index 4315161..99be134 100644 --- a/Segmenter/Constants.cs +++ b/Segmenter/Constants.cs @@ -5,11 +5,13 @@ namespace JiebaNet.Segmenter { public class Constants { - public static readonly double MinProb = -3.14e100; + public const double MinProb = -3.14e100; - public static readonly List NounPos = new List() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" }; - public static readonly List VerbPos = new List() { "v", "vd", "vg", "vi", "vn", "vq" }; + public static readonly List NounPos = new List() + {"n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz"}; + + public static readonly List VerbPos = new List() {"v", "vd", "vg", "vi", "vn", "vq"}; public static readonly List NounAndVerbPos = NounPos.Union(VerbPos).ToList(); - public static readonly List IdiomPos = new List() { "i" }; + public static readonly List IdiomPos = new List() {"i"}; } -} +} \ No newline at end of file diff --git a/Segmenter/DefaultDictionary.cs b/Segmenter/DefaultDictionary.cs index 45bac66..e6690ae 100644 --- a/Segmenter/DefaultDictionary.cs +++ b/Segmenter/DefaultDictionary.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using System.Collections.Generic; namespace JiebaNet.Segmenter { @@ -16,9 +12,10 @@ public class DefaultDictionary : Dictionary { Add(key, default(TValue)); } + return base[key]; } - set { base[key] = value; } + set => base[key] = value; } } -} +} \ No newline at end of file diff --git a/Segmenter/FinalSeg/IFinalSeg.cs b/Segmenter/FinalSeg/IFinalSeg.cs index ed88037..f8e6d04 100644 --- a/Segmenter/FinalSeg/IFinalSeg.cs +++ b/Segmenter/FinalSeg/IFinalSeg.cs @@ -1,10 +1,9 @@ -using System; using System.Collections.Generic; namespace JiebaNet.Segmenter.FinalSeg { /// - /// ڴʵз֮ʹô˽ӿڽз֣ĬʵΪHMM + /// 在词典切分之后,使用此接口进行切分,默认实现为HMM方法。 /// public interface IFinalSeg { diff --git a/Segmenter/FinalSeg/Viterbi.cs b/Segmenter/FinalSeg/Viterbi.cs index ca78aa6..7e3df38 100644 --- a/Segmenter/FinalSeg/Viterbi.cs +++ b/Segmenter/FinalSeg/Viterbi.cs @@ -2,7 +2,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.Linq; -using System.Text; using System.Text.RegularExpressions; using JiebaNet.Segmenter.Common; using Newtonsoft.Json; @@ -12,7 +11,7 @@ namespace JiebaNet.Segmenter.FinalSeg public class Viterbi : IFinalSeg { private static readonly Lazy Lazy = new Lazy(() => new Viterbi()); - private static readonly char[] States = { 'B', 'M', 'E', 'S' }; + private static readonly char[] States = {'B', 'M', 'E', 'S'}; private static readonly Regex RegexChinese = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled); private static readonly Regex RegexSkip = new Regex(@"([a-zA-Z0-9]+(?:\.\d+)?%?)", RegexOptions.Compiled); @@ -28,10 +27,7 @@ private Viterbi() } // TODO: synchronized - public static Viterbi Instance - { - get { return Lazy.Value; } - } + public static Viterbi Instance => Lazy.Value; public IEnumerable Cut(string sentence) { @@ -39,15 +35,14 @@ public IEnumerable Cut(string sentence) foreach (var blk in RegexChinese.Split(sentence)) { if (RegexChinese.IsMatch(blk)) - { tokens.AddRange(ViterbiCut(blk)); - } else { var segments = RegexSkip.Split(blk).Where(seg => !string.IsNullOrEmpty(seg)); tokens.AddRange(segments); } } + return tokens; } @@ -60,10 +55,10 @@ private void LoadModel() _prevStatus = new Dictionary() { - {'B', new []{'E', 'S'}}, - {'M', new []{'M', 'B'}}, - {'S', new []{'S', 'E'}}, - {'E', new []{'B', 'M'}} + {'B', new[] {'E', 'S'}}, + {'M', new[] {'M', 'B'}}, + {'S', new[] {'S', 'E'}}, + {'E', new[] {'B', 'M'}} }; _startProbs = new Dictionary() @@ -108,20 +103,20 @@ private IEnumerable ViterbiCut(string sentence) { var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb); - Pair candidate = new Pair('\0', double.MinValue); + var candidate = new Pair('\0', double.MinValue); foreach (var y0 in _prevStatus[y]) { var tranp = _transProbs[y0].GetDefault(y, Constants.MinProb); tranp = v[i - 1][y0] + tranp + emp; - if (candidate.Freq <= tranp) - { - candidate.Freq = tranp; - candidate.Key = y0; - } + if (!(candidate.Freq <= tranp)) continue; + candidate.Freq = tranp; + candidate.Key = y0; } + vv[y] = candidate.Freq; newPath[y] = new Node(y, path[candidate.Key]); } + path = newPath; } @@ -135,6 +130,7 @@ private IEnumerable ViterbiCut(string sentence) posList.Add(finalPath.Value); finalPath = finalPath.Parent; } + posList.Reverse(); var tokens = new List(); @@ -142,23 +138,24 @@ private IEnumerable ViterbiCut(string sentence) for (var i = 0; i < sentence.Length; i++) { var pos = posList[i]; - if (pos == 'B') - begin = i; - else if (pos == 'E') - { - tokens.Add(sentence.Sub(begin, i + 1)); - next = i + 1; - } - else if (pos == 'S') + switch (pos) { - tokens.Add(sentence.Sub(i, i + 1)); - next = i + 1; + case 'B': + begin = i; + break; + case 'E': + tokens.Add(sentence.Sub(begin, i + 1)); + next = i + 1; + break; + case 'S': + tokens.Add(sentence.Sub(i, i + 1)); + next = i + 1; + break; } } + if (next < sentence.Length) - { tokens.Add(sentence.Substring(next)); - } return tokens; } diff --git a/Segmenter/JiebaSegmenter.cs b/Segmenter/JiebaSegmenter.cs index 98bd7b3..18787e4 100644 --- a/Segmenter/JiebaSegmenter.cs +++ b/Segmenter/JiebaSegmenter.cs @@ -16,13 +16,14 @@ public class JiebaSegmenter private static readonly IFinalSeg FinalSeg = Viterbi.Instance; private static readonly ISet LoadedPath = new HashSet(); - private static readonly object locker = new object(); + private static readonly object Locker = new object(); internal IDictionary UserWordTagTab { get; set; } #region Regular Expressions - internal static readonly Regex RegexChineseDefault = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", RegexOptions.Compiled); + internal static readonly Regex RegexChineseDefault = + new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", RegexOptions.Compiled); internal static readonly Regex RegexSkipDefault = new Regex(@"(\r\n|\s)", RegexOptions.Compiled); @@ -31,7 +32,8 @@ public class JiebaSegmenter internal static readonly Regex RegexEnglishChars = new Regex(@"[a-zA-Z0-9]", RegexOptions.Compiled); - internal static readonly Regex RegexUserDict = new Regex("^(?.+?)(? [0-9]+)?(? [a-z]+)?$", RegexOptions.Compiled); + internal static readonly Regex RegexUserDict = + new Regex("^(?.+?)(? [0-9]+)?(? [a-z]+)?$", RegexOptions.Compiled); #endregion @@ -52,7 +54,7 @@ public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true { var reHan = RegexChineseDefault; var reSkip = RegexSkipDefault; - Func> cutMethod = null; + Func> cutMethod; if (cutAll) { @@ -61,17 +63,11 @@ public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true } if (cutAll) - { cutMethod = CutAll; - } else if (hmm) - { cutMethod = CutDag; - } else - { cutMethod = CutDagWithoutHmm; - } return CutIt(text, cutMethod, reHan, reSkip, cutAll); } @@ -84,28 +80,12 @@ public IEnumerable CutForSearch(string text, bool hmm = true) foreach (var w in words) { if (w.Length > 2) - { - foreach (var i in Enumerable.Range(0, w.Length - 1)) - { - var gram2 = w.Substring(i, 2); - if (WordDict.ContainsWord(gram2)) - { - result.Add(gram2); - } - } - } + result.AddRange(Enumerable.Range(0, w.Length - 1).Select(i => w.Substring(i, 2)) + .Where(gram2 => WordDict.ContainsWord(gram2))); if (w.Length > 3) - { - foreach (var i in Enumerable.Range(0, w.Length - 2)) - { - var gram3 = w.Substring(i, 3); - if (WordDict.ContainsWord(gram3)) - { - result.Add(gram3); - } - } - } + result.AddRange(Enumerable.Range(0, w.Length - 2).Select(i => w.Substring(i, 3)) + .Where(gram3 => WordDict.ContainsWord(gram3))); result.Add(w); } @@ -138,20 +118,17 @@ public IEnumerable Tokenize(string text, TokenizerMode mode = TokenizerMo { var gram2 = w.Substring(i, 2); if (WordDict.ContainsWord(gram2)) - { result.Add(new Token(gram2, start + i, start + i + 2)); - } } } + if (width > 3) { for (var i = 0; i < width - 2; i++) { var gram3 = w.Substring(i, 3); if (WordDict.ContainsWord(gram3)) - { result.Add(new Token(gram3, start + i, start + i + 3)); - } } } @@ -170,31 +147,33 @@ internal IDictionary> GetDag(string sentence) var dag = new Dictionary>(); var trie = WordDict.Trie; - var N = sentence.Length; + var n = sentence.Length; for (var k = 0; k < sentence.Length; k++) { - var templist = new List(); + var tempList = new List(); var i = k; var frag = sentence.Substring(k, 1); - while (i < N && trie.ContainsKey(frag)) + while (i < n && trie.ContainsKey(frag)) { if (trie[frag] > 0) { - templist.Add(i); + tempList.Add(i); } i++; // TODO: - if (i < N) + if (i < n) { frag = sentence.Sub(k, i + 1); } } - if (templist.Count == 0) + + if (tempList.Count == 0) { - templist.Add(k); + tempList.Add(k); } - dag[k] = templist; + + dag[k] = tempList; } return dag; @@ -203,24 +182,24 @@ internal IDictionary> GetDag(string sentence) internal IDictionary> Calc(string sentence, IDictionary> dag) { var n = sentence.Length; - var route = new Dictionary>(); - route[n] = new Pair(0, 0.0); + var route = new Dictionary> {[n] = new Pair(0, 0.0)}; var logtotal = Math.Log(WordDict.Total); for (var i = n - 1; i > -1; i--) { var candidate = new Pair(-1, double.MinValue); - foreach (int x in dag[i]) + foreach (var x in dag[i]) { - var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal + route[x + 1].Freq; - if (candidate.Freq < freq) - { - candidate.Freq = freq; - candidate.Key = x; - } + var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal + + route[x + 1].Freq; + if (!(candidate.Freq < freq)) continue; + candidate.Freq = freq; + candidate.Key = x; } + route[i] = candidate; } + return route; } @@ -244,11 +223,9 @@ internal IEnumerable CutAll(string sentence) { foreach (var j in nexts) { - if (j > k) - { - words.Add(sentence.Substring(k, j + 1 - k)); - lastPos = j; - } + if (j <= k) continue; + words.Add(sentence.Substring(k, j + 1 - k)); + lastPos = j; } } } @@ -281,8 +258,10 @@ internal IEnumerable CutDag(string sentence) AddBufferToWordList(tokens, buf); buf = string.Empty; } + tokens.Add(w); } + x = y; } @@ -302,17 +281,16 @@ internal IEnumerable CutDagWithoutHmm(string sentence) var words = new List(); var x = 0; - string buf = string.Empty; - var N = sentence.Length; + var buf = string.Empty; + var sentenceLength = sentence.Length; - var y = -1; - while (x < N) + while (x < sentenceLength) { - y = route[x].Key + 1; - var l_word = sentence.Substring(x, y - x); - if (RegexEnglishChars.IsMatch(l_word) && l_word.Length == 1) + var y = route[x].Key + 1; + var lWord = sentence.Substring(x, y - x); + if (RegexEnglishChars.IsMatch(lWord) && lWord.Length == 1) { - buf += l_word; + buf += lWord; x = y; } else @@ -322,7 +300,8 @@ internal IEnumerable CutDagWithoutHmm(string sentence) words.Add(buf); buf = string.Empty; } - words.Add(l_word); + + words.Add(lWord); x = y; } } @@ -336,7 +315,7 @@ internal IEnumerable CutDagWithoutHmm(string sentence) } internal IEnumerable CutIt(string text, Func> cutMethod, - Regex reHan, Regex reSkip, bool cutAll) + Regex reHan, Regex reSkip, bool cutAll) { var result = new List(); var blocks = reHan.Split(text); @@ -349,10 +328,7 @@ internal IEnumerable CutIt(string text, Func if (reHan.IsMatch(blk)) { - foreach (var word in cutMethod(blk)) - { - result.Add(word); - } + result.AddRange(cutMethod(blk)); } else { @@ -360,20 +336,12 @@ internal IEnumerable CutIt(string text, Func foreach (var x in tmp) { if (reSkip.IsMatch(x)) - { result.Add(x); - } else if (!cutAll) - { foreach (var ch in x) - { result.Add(ch.ToString()); - } - } else - { result.Add(x); - } } } } @@ -394,7 +362,7 @@ public void LoadUserDict(string userDictFile) var dictFullPath = Path.GetFullPath(userDictFile); Debug.WriteLine("Initializing user dictionary: " + userDictFile); - lock (locker) + lock (Locker) { if (LoadedPath.Contains(dictFullPath)) return; @@ -425,7 +393,7 @@ public void LoadUserDict(string userDictFile) } catch (IOException e) { - Debug.Fail(string.Format("'{0}' load failure, reason: {1}", dictFullPath, e.Message)); + Debug.Fail($"'{dictFullPath}' load failure, reason: {e.Message}"); } catch (FormatException fe) { @@ -440,6 +408,7 @@ public void AddWord(string word, int freq = 0, string tag = null) { freq = WordDict.SuggestFreq(word, Cut(word, hmm: false)); } + WordDict.AddWord(word, freq); // Add user word tag of POS diff --git a/Segmenter/Pair.cs b/Segmenter/Pair.cs index 59d1d94..bc008d3 100644 --- a/Segmenter/Pair.cs +++ b/Segmenter/Pair.cs @@ -2,7 +2,7 @@ { public class Pair { - public TKey Key { get;set; } + public TKey Key { get; set; } public double Freq { get; set; } public Pair(TKey key, double freq) @@ -16,4 +16,4 @@ public override string ToString() return "Candidate [Key=" + Key + ", Freq=" + Freq + "]"; } } -} +} \ No newline at end of file diff --git a/Segmenter/PosSeg/Pair.cs b/Segmenter/PosSeg/Pair.cs index 6eda19b..fc57b90 100644 --- a/Segmenter/PosSeg/Pair.cs +++ b/Segmenter/PosSeg/Pair.cs @@ -4,6 +4,7 @@ public class Pair { public string Word { get; set; } public string Flag { get; set; } + public Pair(string word, string flag) { Word = word; @@ -12,7 +13,7 @@ public Pair(string word, string flag) public override string ToString() { - return string.Format("{0}/{1}", Word, Flag); + return $"{Word}/{Flag}"; } } -} +} \ No newline at end of file diff --git a/Segmenter/PosSeg/PosSegmenter.cs b/Segmenter/PosSeg/PosSegmenter.cs index 7afbe20..eed7402 100644 --- a/Segmenter/PosSeg/PosSegmenter.cs +++ b/Segmenter/PosSeg/PosSegmenter.cs @@ -18,7 +18,9 @@ public class PosSegmenter #region Regular Expressions - internal static readonly Regex RegexChineseInternal = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); + internal static readonly Regex RegexChineseInternal = + new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); + internal static readonly Regex RegexSkipInternal = new Regex(@"(\r\n|\s)", RegexOptions.Compiled); internal static readonly Regex RegexChineseDetail = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled); @@ -49,7 +51,7 @@ private static void LoadWordTagTab() var tokens = line.Split(' '); if (tokens.Length < 2) { - Debug.Fail(string.Format("Invalid line: {0}", line)); + Debug.Fail($"Invalid line: {line}"); continue; } @@ -61,7 +63,7 @@ private static void LoadWordTagTab() } catch (System.IO.IOException e) { - Debug.Fail(string.Format("Word tag table load failure, reason: {0}", e.Message)); + Debug.Fail($"Word tag table load failure, reason: {e.Message}"); } catch (FormatException fe) { @@ -69,7 +71,7 @@ private static void LoadWordTagTab() } } - private JiebaSegmenter _segmenter; + private readonly JiebaSegmenter _segmenter; public PosSegmenter() { @@ -83,11 +85,9 @@ public PosSegmenter(JiebaSegmenter segmenter) private void CheckNewUserWordTags() { - if (_segmenter.UserWordTagTab.IsNotEmpty()) - { - _wordTagTab.Update(_segmenter.UserWordTagTab); - _segmenter.UserWordTagTab = new Dictionary(); - } + if (!_segmenter.UserWordTagTab.IsNotEmpty()) return; + _wordTagTab.Update(_segmenter.UserWordTagTab); + _segmenter.UserWordTagTab = new Dictionary(); } public IEnumerable Cut(string text, bool hmm = true) @@ -104,13 +104,9 @@ internal IEnumerable CutInternal(string text, bool hmm = true) var blocks = RegexChineseInternal.Split(text); Func> cutMethod = null; if (hmm) - { cutMethod = CutDag; - } else - { cutMethod = CutDagWithoutHmm; - } var tokens = new List(); foreach (var blk in blocks) @@ -135,17 +131,11 @@ internal IEnumerable CutInternal(string text, bool hmm = true) // TODO: each char? var xxs = xx.ToString(); if (RegexNumbers.IsMatch(xxs)) - { tokens.Add(new Pair(xxs, "m")); - } else if (RegexEnglishWords.IsMatch(x)) - { tokens.Add(new Pair(xxs, "eng")); - } else - { tokens.Add(new Pair(xxs, "x")); - } } } } @@ -170,9 +160,7 @@ internal IEnumerable CutDag(string sentence) var y = route[x].Key + 1; var w = sentence.Substring(x, y - x); if (y - x == 1) - { buf += w; - } else { if (buf.Length > 0) @@ -180,15 +168,15 @@ internal IEnumerable CutDag(string sentence) AddBufferToWordList(tokens, buf); buf = string.Empty; } + tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x"))); } + x = y; } if (buf.Length > 0) - { AddBufferToWordList(tokens, buf); - } return tokens; } @@ -204,10 +192,9 @@ internal IEnumerable CutDagWithoutHmm(string sentence) var buf = string.Empty; var n = sentence.Length; - var y = -1; while (x < n) { - y = route[x].Key + 1; + var y = route[x].Key + 1; var w = sentence.Substring(x, y - x); // TODO: char or word? if (RegexEnglishChar.IsMatch(w)) @@ -222,15 +209,14 @@ internal IEnumerable CutDagWithoutHmm(string sentence) tokens.Add(new Pair(buf, "eng")); buf = string.Empty; } + tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x"))); x = y; } } if (buf.Length > 0) - { tokens.Add(new Pair(buf, "eng")); - } return tokens; } @@ -250,21 +236,13 @@ internal IEnumerable CutDetail(string text) var tmp = RegexSkipDetail.Split(blk); foreach (var x in tmp) { - if (!string.IsNullOrWhiteSpace(x)) - { - if (RegexNumbers.IsMatch(x)) - { - tokens.Add(new Pair(x, "m")); - } - else if(RegexEnglishWords.IsMatch(x)) - { - tokens.Add(new Pair(x, "eng")); - } - else - { - tokens.Add(new Pair(x, "x")); - } - } + if (string.IsNullOrWhiteSpace(x)) continue; + if (RegexNumbers.IsMatch(x)) + tokens.Add(new Pair(x, "m")); + else if (RegexEnglishWords.IsMatch(x)) + tokens.Add(new Pair(x, "eng")); + else + tokens.Add(new Pair(x, "x")); } } } diff --git a/Segmenter/PosSeg/Viterbi.cs b/Segmenter/PosSeg/Viterbi.cs index 349815d..3e6301c 100644 --- a/Segmenter/PosSeg/Viterbi.cs +++ b/Segmenter/PosSeg/Viterbi.cs @@ -21,10 +21,7 @@ private Viterbi() } // TODO: synchronized - public static Viterbi Instance - { - get { return Lazy.Value; } - } + public static Viterbi Instance => Lazy.Value; public IEnumerable Cut(string sentence) { @@ -38,24 +35,27 @@ public IEnumerable Cut(string sentence) var parts = posList[i].Split('-'); var charState = parts[0][0]; var pos = parts[1]; - if (charState == 'B') - begin = i; - else if (charState == 'E') - { - tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos)); - next = i + 1; - } - else if (charState == 'S') + switch (charState) { - tokens.Add(new Pair(sentence.Sub(i, i + 1), pos)); - next = i + 1; + case 'B': + begin = i; + break; + case 'E': + tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos)); + next = i + 1; + break; + case 'S': + tokens.Add(new Pair(sentence.Sub(i, i + 1), pos)); + next = i + 1; + break; } } + if (next < sentence.Length) { tokens.Add(new Pair(sentence.Substring(next), posList[next].Split('-')[1])); } - + return tokens; } @@ -138,6 +138,7 @@ private Tuple> ViterbiCut(string sentence) state = y0; } } + v[i][y] = prob; memPath[i][y] = state; } @@ -150,8 +151,9 @@ private Tuple> ViterbiCut(string sentence) foreach (var endPoint in last) { // TODO: compare two very small values; - if (endProb < endPoint.Prob || - (endProb == endPoint.Prob && String.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0)) + if (endProb < endPoint.Prob || + (endProb == endPoint.Prob && + string.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0)) { endProb = endPoint.Prob; endState = endPoint.State; @@ -161,7 +163,7 @@ private Tuple> ViterbiCut(string sentence) var route = new string[sentence.Length]; var n = sentence.Length - 1; var curState = endState; - while(n >= 0) + while (n >= 0) { route[n] = curState; curState = memPath[n][curState]; diff --git a/Segmenter/Segmenter.csproj b/Segmenter/Segmenter.csproj index 9a8b10a..0404426 100644 --- a/Segmenter/Segmenter.csproj +++ b/Segmenter/Segmenter.csproj @@ -55,8 +55,8 @@ - - + + \ No newline at end of file diff --git a/Segmenter/Spelling/SpellChecker.cs b/Segmenter/Spelling/SpellChecker.cs index 518f345..290f970 100644 --- a/Segmenter/Spelling/SpellChecker.cs +++ b/Segmenter/Spelling/SpellChecker.cs @@ -14,7 +14,7 @@ public class SpellChecker : ISpellChecker internal static readonly WordDictionary WordDict = WordDictionary.Instance; internal readonly Trie WordTrie; - internal readonly Dictionary> FirstChars; + internal readonly Dictionary> FirstChars; public SpellChecker() { @@ -24,21 +24,16 @@ public SpellChecker() foreach (var wd in wordDict.Trie) { - if (wd.Value > 0) - { - WordTrie.Insert(wd.Key, wd.Value); - - if (wd.Key.Length >= 2) - { - var second = wd.Key[1]; - var first = wd.Key[0]; - if (!FirstChars.ContainsKey(second)) - { - FirstChars[second] = new HashSet(); - } - FirstChars[second].Add(first); - } - } + if (wd.Value <= 0) continue; + WordTrie.Insert(wd.Key, wd.Value); + + if (wd.Key.Length < 2) continue; + var second = wd.Key[1]; + var first = wd.Key[0]; + if (!FirstChars.ContainsKey(second)) + FirstChars[second] = new HashSet(); + + FirstChars[second].Add(first); } } @@ -46,9 +41,7 @@ internal ISet GetEdits1(string word) { var splits = new List(); for (var i = 0; i <= word.Length; i++) - { - splits.Add(new WordSplit() { Left = word.Substring(0, i), Right = word.Substring(i) }); - } + splits.Add(new WordSplit() {Left = word.Substring(0, i), Right = word.Substring(i)}); var deletes = splits .Where(s => !string.IsNullOrEmpty(s.Right)) @@ -63,20 +56,15 @@ internal ISet GetEdits1(string word) { var firsts = FirstChars[word[1]]; foreach (var first in firsts) - { if (first != word[0]) - { replaces.Add(first + word.Substring(1)); - } - } var node = WordTrie.Root.Children[word[0]]; - for (int i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) + for (var i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) { foreach (var c in node.Children.Keys) - { replaces.Add(word.Substring(0, i) + c + word.Substring(i + 1)); - } + node = node.Children.GetValueOrDefault(word[i]); } } @@ -88,23 +76,17 @@ internal ISet GetEdits1(string word) { var firsts = FirstChars[word[0]]; foreach (var first in firsts) - { inserts.Add(first + word); - } } var node = WordTrie.Root.Children.GetValueOrDefault(word[0]); - for (int i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) + for (var i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) { foreach (var c in node.Children.Keys) - { - inserts.Add(word.Substring(0, i+1) + c + word.Substring(i+1)); - } + inserts.Add(word.Substring(0, i + 1) + c + word.Substring(i + 1)); if (i < word.Length - 1) - { node = node.Children.GetValueOrDefault(word[i + 1]); - } } } @@ -121,9 +103,8 @@ internal ISet GetKnownEdits2(string word) { var result = new HashSet(); foreach (var e1 in GetEdits1(word)) - { result.UnionWith(GetEdits1(e1).Where(e => WordDictionary.Instance.ContainsWord(e))); - } + return result; } @@ -135,16 +116,12 @@ internal ISet GetKnownWords(IEnumerable words) public IEnumerable Suggests(string word) { if (WordDict.ContainsWord(word)) - { return new[] {word}; - } var candicates = GetKnownWords(GetEdits1(word)); if (candicates.IsNotEmpty()) - { return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c)); - } - + candicates.UnionWith(GetKnownEdits2(word)); return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c)); } @@ -155,4 +132,4 @@ internal class WordSplit public string Left { get; set; } public string Right { get; set; } } -} +} \ No newline at end of file diff --git a/Segmenter/Token.cs b/Segmenter/Token.cs index 1083027..5e2ea67 100644 --- a/Segmenter/Token.cs +++ b/Segmenter/Token.cs @@ -15,7 +15,7 @@ public Token(string word, int startIndex, int endIndex) public override string ToString() { - return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex); + return $"[{Word}, ({StartIndex}, {EndIndex})]"; } } } \ No newline at end of file diff --git a/Segmenter/WordDictionary.cs b/Segmenter/WordDictionary.cs index 553e200..45b2466 100644 --- a/Segmenter/WordDictionary.cs +++ b/Segmenter/WordDictionary.cs @@ -15,7 +15,7 @@ public class WordDictionary private static readonly Lazy lazy = new Lazy(() => new WordDictionary()); private static readonly string MainDict = ConfigManager.MainDictFile; - internal IDictionary Trie = new Dictionary(); + internal readonly IDictionary Trie = new Dictionary(); /// /// total occurrence of all words. @@ -30,10 +30,7 @@ private WordDictionary() Debug.WriteLine("total freq: {0}", Total); } - public static WordDictionary Instance - { - get { return lazy.Value; } - } + public static WordDictionary Instance => lazy.Value; private void LoadDict() { @@ -52,7 +49,7 @@ private void LoadDict() var tokens = line.Split(' '); if (tokens.Length < 2) { - Debug.Fail(string.Format("Invalid line: {0}", line)); + Debug.Fail($"Invalid line: {line}"); continue; } @@ -78,7 +75,7 @@ private void LoadDict() } catch (IOException e) { - Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message)); + Debug.Fail($"{MainDict} load failure, reason: {e.Message}"); } catch (FormatException fe) { @@ -93,18 +90,13 @@ public bool ContainsWord(string word) public int GetFreqOrDefault(string key) { - if (ContainsWord(key)) - return Trie[key]; - else - return 1; + return ContainsWord(key) ? Trie[key] : 1; } public void AddWord(string word, int freq, string tag = null) { if (ContainsWord(word)) - { Total -= Trie[word]; - } Trie[word] = freq; Total += freq; @@ -125,13 +117,10 @@ public void DeleteWord(string word) internal int SuggestFreq(string word, IEnumerable segments) { - double freq = 1; - foreach (var seg in segments) - { - freq *= GetFreqOrDefault(seg) / Total; - } + var freq = segments.Aggregate(1, + (current, seg) => current * (GetFreqOrDefault(seg) / Total)); - return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word)); + return Math.Max((int) (freq * Total) + 1, GetFreqOrDefault(word)); } } } \ No newline at end of file diff --git a/TestProject/TestProject.csproj b/TestProject/TestProject.csproj new file mode 100644 index 0000000..6c6743f --- /dev/null +++ b/TestProject/TestProject.csproj @@ -0,0 +1,19 @@ + + + + netcoreapp2.2 + + false + + + + + + + + + + + + + diff --git a/TestProject/UnitTest1.cs b/TestProject/UnitTest1.cs new file mode 100644 index 0000000..3b8fe04 --- /dev/null +++ b/TestProject/UnitTest1.cs @@ -0,0 +1,171 @@ +using System; +using System.Linq; +using JiebaNet.Analyser; +using JiebaNet.Segmenter; +using JiebaNet.Segmenter.Common; +using JiebaNet.Segmenter.PosSeg; +using Xunit; +using Xunit.Abstractions; + +namespace TestProject +{ + public class UnitTest1 + { + private readonly ITestOutputHelper _testOutputHelper; + + public UnitTest1(ITestOutputHelper testOutputHelper) + { + _testOutputHelper = testOutputHelper; + } + + [Fact] + public void Test1() + { + var segmenter = new JiebaSegmenter(); + segmenter.LoadUserDict("userdict.txt"); + var segments = segmenter.Cut("我来到北京清华大学", cutAll: true); + Assert.Equal("【全模式】:我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学", $"【全模式】:{string.Join("/ ", segments)}"); + segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式 + Assert.Equal("【精确模式】:我/ 来到/ 北京/ 清华大学", $"【精确模式】:{string.Join("/ ", segments)}"); + + segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型 + Assert.Equal("【新词识别】:他/ 来到/ 了/ 网易/ 杭研/ 大厦", $"【新词识别】:{string.Join("/ ", segments)}"); + + segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // 搜索引擎模式 + Assert.Equal("【搜索引擎模式】:小明/ 硕士/ 毕业/ 于/ 中国/ 科学/ 学院/ 科学院/ 中国科学院/ 计算/ 计算所/ ,/ 后/ 在/ 日本/ 京都/ 大学/ 日本京都大学/ 深造", + $"【搜索引擎模式】:{string.Join("/ ", segments)}"); + + segments = segmenter.Cut("结过婚的和尚未结过婚的"); + Assert.Equal("【歧义消除】:结过婚/ 的/ 和/ 尚未/ 结过婚/ 的", $"【歧义消除】:{string.Join("/ ", segments)}"); + + segments = segmenter.Cut("linezerodemo机器学习学习机器"); + Assert.Equal("【用户字典】:linezero/ demo/ 机器学习/ 学习/ 机器", $"【用户字典】:{string.Join("/ ", segments)}"); + } + + [Fact] + public void Test2() + { + var segmenter = new JiebaSegmenter(); + //词频统计 + var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。"; + var freqs = new Counter(segmenter.Cut(s)); + foreach (var (key, value) in freqs.MostCommon(5)) + { + _testOutputHelper.WriteLine($"{key}: {value}"); + } + } + + [Fact] + public void CutDemo() + { + var segmenter = new JiebaSegmenter(); + var segments = segmenter.Cut("我来到北京清华大学", cutAll: true); + Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式 + Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型 + Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments)); + + segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // 搜索引擎模式 + Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("结过婚的和尚未结过婚的"); + Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("北京大学生喝进口红酒"); + Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("在北京大学生活区喝进口红酒"); + Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments)); + + segments = segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验"); + Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments)); + + segmenter.DeleteWord("湖南"); + segmenter.AddWord("湖南"); + //segmenter.AddWord("长沙市"); + segments = segmenter.Cut("湖南长沙市天心区"); + Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments)); + } + + [Fact] + public void TokenizeDemo() + { + var segmenter = new JiebaSegmenter(); + var s = "永和服装饰品有限公司"; + var tokens = segmenter.Tokenize(s); + foreach (var token in tokens) + { + Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, + token.EndIndex); + } + } + + [Fact] + public void TokenizeSearchDemo() + { + var segmenter = new JiebaSegmenter(); + var s = "永和服装饰品有限公司"; + var tokens = segmenter.Tokenize(s, TokenizerMode.Search); + foreach (var token in tokens) + { + Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, + token.EndIndex); + } + } + + [Fact] + public void PosCutDemo() + { + var posSeg = new PosSegmenter(); + var s = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移"; + + var tokens = posSeg.Cut(s); + Console.WriteLine( + string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag)))); + } + + [Fact] + public void ExtractTagsDemo() + { + var text = + "程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。"; + var extractor = new TfidfExtractor(); + var keywords = extractor.ExtractTags(text); + foreach (var keyword in keywords) + { + Console.WriteLine(keyword); + } + } + + [Fact] + public void ExtractTagsDemo2() + { + var text = + @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。 + 算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。 + 形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。"; + + var extractor = new TfidfExtractor(); + var keywords = extractor.ExtractTags(text, 10, Constants.NounAndVerbPos); + foreach (var keyword in keywords) + { + Console.WriteLine(keyword); + } + } + + [Fact] + public void TestWordFreq() + { + var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。"; + var seg = new JiebaSegmenter(); + var freqs = new Counter(seg.Cut(s)); + foreach (var pair in freqs.MostCommon()) + { + Console.WriteLine($"{pair.Key}: {pair.Value}"); + } + } + } +} \ No newline at end of file diff --git a/jieba.NET/Program.cs b/jieba.NET/Program.cs index 3b6c0a5..4d6562a 100644 --- a/jieba.NET/Program.cs +++ b/jieba.NET/Program.cs @@ -32,9 +32,9 @@ static void Main(string[] args) //词频统计 var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。"; var freqs = new Counter(segmenter.Cut(s)); - foreach (var pair in freqs.MostCommon(5)) + foreach (var (key, value) in freqs.MostCommon(5)) { - Console.WriteLine($"{pair.Key}: {pair.Value}"); + Console.WriteLine($"{key}: {value}"); } //new TestDemo().CutDemo(); diff --git a/jieba.NET/TestDemo.cs b/jieba.NET/TestDemo.cs index 7fd49a7..b2abc20 100644 --- a/jieba.NET/TestDemo.cs +++ b/jieba.NET/TestDemo.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using JiebaNet.Analyser; using JiebaNet.Segmenter.PosSeg; @@ -16,10 +17,10 @@ public void CutDemo() var segments = segmenter.Cut("我来到北京清华大学", cutAll: true); Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments)); - segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式 + segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式 Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments)); - segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型 + segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型 Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments)); segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // 搜索引擎模式 @@ -58,7 +59,8 @@ public void TokenizeDemo() var tokens = segmenter.Tokenize(s); foreach (var token in tokens) { - Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, token.EndIndex); + Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, + token.EndIndex); } } @@ -70,7 +72,8 @@ public void TokenizeSearchDemo() var tokens = segmenter.Tokenize(s, TokenizerMode.Search); foreach (var token in tokens) { - Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, token.EndIndex); + Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, + token.EndIndex); } } @@ -81,7 +84,8 @@ public void PosCutDemo() var s = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移"; var tokens = posSeg.Cut(s); - Console.WriteLine(string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag)))); + Console.WriteLine( + string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag)))); } @@ -89,7 +93,7 @@ public void ExtractTagsDemo() { var text = "程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。"; - var extractor = new TfidfExtractor(); + var extractor = new TfidfExtractor(new HashSet(), new Dictionary()); var keywords = extractor.ExtractTags(text); foreach (var keyword in keywords) { @@ -100,11 +104,12 @@ public void ExtractTagsDemo() public void ExtractTagsDemo2() { - var text = @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。 + var text = + @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。 算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。 形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。"; - var extractor = new TfidfExtractor(); + var extractor = new TfidfExtractor(new HashSet(), new Dictionary()); var keywords = extractor.ExtractTags(text, 10, Constants.NounAndVerbPos); foreach (var keyword in keywords) { @@ -117,9 +122,9 @@ public void TestWordFreq() var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。"; var seg = new JiebaSegmenter(); var freqs = new Counter(seg.Cut(s)); - foreach (var pair in freqs.MostCommon()) + foreach (var (key, value) in freqs.MostCommon()) { - Console.WriteLine($"{pair.Key}: {pair.Value}"); + Console.WriteLine($"{key}: {value}"); } } }