diff --git a/Analyser/Analyser.csproj b/Analyser/Analyser.csproj
index 0eadbfc..2224687 100644
--- a/Analyser/Analyser.csproj
+++ b/Analyser/Analyser.csproj
@@ -15,6 +15,20 @@
true
+
+
+
+
+
+
+
+ Never
+
+
+ Never
+
+
+
diff --git a/Analyser/ConfigManager.cs b/Analyser/ConfigManager.cs
index 8539534..3cee865 100644
--- a/Analyser/ConfigManager.cs
+++ b/Analyser/ConfigManager.cs
@@ -1,27 +1,12 @@
using System.IO;
-using System;
namespace JiebaNet.Analyser
{
public class ConfigManager
{
// TODO: duplicate codes.
- public static string ConfigFileBaseDir
- {
- get
- {
- return "Resources";
- }
- }
+ public static string ConfigFileBaseDir => "Resources";
- public static string IdfFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "idf.txt"); }
- }
-
- public static string StopWordsFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "stopwords.txt"); }
- }
+ public static string IdfFile => Path.Combine(ConfigFileBaseDir, "idf.txt");
}
}
\ No newline at end of file
diff --git a/Analyser/IdfLoader.cs b/Analyser/IdfLoader.cs
index 6cdce2a..1795ec3 100644
--- a/Analyser/IdfLoader.cs
+++ b/Analyser/IdfLoader.cs
@@ -1,47 +1,25 @@
using JiebaNet.Segmenter.Common;
using System.Collections.Generic;
-using System.IO;
using System.Linq;
-using System.Reflection;
using System.Text;
namespace JiebaNet.Analyser
{
public class IdfLoader
{
- internal string IdfFilePath { get; set; }
internal IDictionary IdfFreq { get; set; }
internal double MedianIdf { get; set; }
- public IdfLoader(string idfPath = null)
+ public IdfLoader(IDictionary idfFreq)
{
- IdfFilePath = string.Empty;
- IdfFreq = new Dictionary();
- MedianIdf = 0.0;
- if (!string.IsNullOrWhiteSpace(idfPath))
- {
- SetNewPath(idfPath);
- }
+ IdfFreq = idfFreq ?? new Dictionary();
+ MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}
- public void SetNewPath(string newIdfPath)
+ public void SetNewPath(IDictionary newIdfFreq)
{
- var idfPath = newIdfPath;
- if (IdfFilePath != idfPath)
- {
- IdfFilePath = idfPath;
- var lines = FileExtension.ReadEmbeddedAllLines(idfPath, Encoding.UTF8);
- IdfFreq = new Dictionary();
- foreach (var line in lines)
- {
- var parts = line.Trim().Split(' ');
- var word = parts[0];
- var freq = double.Parse(parts[1]);
- IdfFreq[word] = freq;
- }
-
- MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
- }
+ IdfFreq = IdfFreq?.Union(newIdfFreq).ToDictionary(k => k.Key, v => v.Value) ?? newIdfFreq;
+ MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}
}
}
\ No newline at end of file
diff --git a/Analyser/KeywordExtractor.cs b/Analyser/KeywordExtractor.cs
index f428043..db1b92b 100644
--- a/Analyser/KeywordExtractor.cs
+++ b/Analyser/KeywordExtractor.cs
@@ -1,7 +1,4 @@
-using JiebaNet.Segmenter.Common;
-using Microsoft.Extensions.FileProviders;
-using System.Collections.Generic;
-using System.IO;
+using System.Collections.Generic;
namespace JiebaNet.Analyser
{
@@ -14,35 +11,28 @@ public abstract class KeywordExtractor
"this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
};
- protected virtual ISet StopWords { get; set; }
-
- public void SetStopWords(string stopWordsFile)
+ protected ISet StopWords { get; set; }
+ public void SetStopWords(ISet stopWords)
{
- StopWords = new HashSet();
- var lines = FileExtension.ReadEmbeddedAllLines(stopWordsFile);
- foreach (var line in lines)
- {
- StopWords.Add(line.Trim());
- }
+ StopWords = stopWords ?? new HashSet();
}
public void AddStopWord(string word)
{
if (!StopWords.Contains(word))
- {
StopWords.Add(word.Trim());
- }
}
public void AddStopWords(IEnumerable words)
{
foreach (var word in words)
- {
AddStopWord(word);
- }
}
- public abstract IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null);
- public abstract IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null);
+ public abstract IEnumerable ExtractTags(string text, int count = 20,
+ IEnumerable allowPos = null);
+
+ public abstract IEnumerable ExtractTagsWithWeight(string text, int count = 20,
+ IEnumerable allowPos = null);
}
}
\ No newline at end of file
diff --git a/Analyser/TextRankExtractor.cs b/Analyser/TextRankExtractor.cs
index 4d96e57..d65eb35 100644
--- a/Analyser/TextRankExtractor.cs
+++ b/Analyser/TextRankExtractor.cs
@@ -25,31 +25,39 @@ public bool PairFilter(IEnumerable allowPos, Pair wp)
&& !StopWords.Contains(wp.Word.ToLower());
}
- public TextRankExtractor()
+ public TextRankExtractor(ISet stopWords)
{
Span = 5;
Segmenter = new JiebaSegmenter();
PosSegmenter = new PosSegmenter(Segmenter);
- SetStopWords(ConfigManager.StopWordsFile);
+ SetStopWords(stopWords);
if (StopWords.IsEmpty())
- {
StopWords.UnionWith(DefaultStopWords);
- }
}
- public override IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null)
+ public override IEnumerable ExtractTags(string text, int count = 20,
+ IEnumerable allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
- if (count <= 0) { count = 20; }
+ if (count <= 0)
+ {
+ count = 20;
+ }
+
return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}
- public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null)
+ public override IEnumerable ExtractTagsWithWeight(string text, int count = 20,
+ IEnumerable allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
- if (count <= 0) { count = 20; }
- return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
+ if (count <= 0)
+ {
+ count = 20;
+ }
+
+ return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair
{
Word = p.Key, Weight = p.Value
}).Take(count);
@@ -60,9 +68,7 @@ public override IEnumerable ExtractTagsWithWeight(string text, i
private IDictionary ExtractTagRank(string text, IEnumerable allowPos)
{
if (allowPos.IsEmpty())
- {
allowPos = DefaultPosFilter;
- }
var g = new UndirectWeightedGraph();
var cm = new Dictionary();
@@ -71,27 +77,19 @@ private IDictionary ExtractTagRank(string text, IEnumerable= words.Count)
- {
- break;
- }
- if (!PairFilter(allowPos, words[j]))
- {
- continue;
- }
-
- // TODO: better separator.
- var key = wp.Word + "$" + words[j].Word;
- if (!cm.ContainsKey(key))
- {
- cm[key] = 0;
- }
- cm[key] += 1;
- }
+ if (j >= words.Count)
+ break;
+ if (!PairFilter(allowPos, words[j]))
+ continue;
+
+ // TODO: better separator.
+ var key = wp.Word + "$" + words[j].Word;
+ if (!cm.ContainsKey(key))
+ cm[key] = 0;
+ cm[key] += 1;
}
}
diff --git a/Analyser/TfidfExtractor.cs b/Analyser/TfidfExtractor.cs
index fe33e38..4b58e42 100644
--- a/Analyser/TfidfExtractor.cs
+++ b/Analyser/TfidfExtractor.cs
@@ -1,5 +1,4 @@
-using System;
-using System.Collections.Generic;
+using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter;
using JiebaNet.Segmenter.Common;
@@ -10,7 +9,7 @@ namespace JiebaNet.Analyser
public class TfidfExtractor : KeywordExtractor
{
private static readonly string DefaultIdfFile = ConfigManager.IdfFile;
- private static readonly int DefaultWordCount = 20;
+ private const int DefaultWordCount = 20;
private JiebaSegmenter Segmenter { get; set; }
private PosSegmenter PosSegmenter { get; set; }
@@ -19,32 +18,24 @@ public class TfidfExtractor : KeywordExtractor
private IDictionary IdfFreq { get; set; }
private double MedianIdf { get; set; }
- public TfidfExtractor(JiebaSegmenter segmenter = null)
+ public TfidfExtractor(ISet stopWords, IDictionary idfFreq,
+ JiebaSegmenter segmenter = null)
{
- if (segmenter.IsNull())
- {
- Segmenter = new JiebaSegmenter();
- }
- else
- {
- Segmenter = segmenter;
- }
+ Segmenter = segmenter.IsNull() ? new JiebaSegmenter() : segmenter;
PosSegmenter = new PosSegmenter(Segmenter);
- SetStopWords(ConfigManager.StopWordsFile);
+ SetStopWords(stopWords);
if (StopWords.IsEmpty())
- {
StopWords.UnionWith(DefaultStopWords);
- }
- Loader = new IdfLoader(DefaultIdfFile);
+ Loader = new IdfLoader(idfFreq);
IdfFreq = Loader.IdfFreq;
MedianIdf = Loader.MedianIdf;
}
- public void SetIdfPath(string idfPath)
+ public void SetIdfPath(IDictionary idfFreq)
{
- Loader.SetNewPath(idfPath);
+ Loader.SetNewPath(idfFreq);
IdfFreq = Loader.IdfFreq;
MedianIdf = Loader.MedianIdf;
}
@@ -58,14 +49,7 @@ private IEnumerable FilterCutByPos(string text, IEnumerable allo
private IDictionary GetWordIfidf(string text, IEnumerable allowPos)
{
IEnumerable words = null;
- if (allowPos.IsNotEmpty())
- {
- words = FilterCutByPos(text, allowPos);
- }
- else
- {
- words = Segmenter.Cut(text);
- }
+ words = allowPos.IsNotEmpty() ? FilterCutByPos(text, allowPos) : Segmenter.Cut(text);
// Calculate TF
var freq = new Dictionary();
@@ -73,31 +57,32 @@ private IDictionary GetWordIfidf(string text, IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null)
+ public override IEnumerable ExtractTags(string text, int count = 20,
+ IEnumerable allowPos = null)
{
- if (count <= 0) { count = DefaultWordCount; }
+ if (count <= 0)
+ count = DefaultWordCount;
var freq = GetWordIfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}
- public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null)
+ public override IEnumerable ExtractTagsWithWeight(string text, int count = 20,
+ IEnumerable allowPos = null)
{
- if (count <= 0) { count = DefaultWordCount; }
+ if (count <= 0)
+ count = DefaultWordCount;
var freq = GetWordIfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
@@ -112,4 +97,4 @@ public class WordWeightPair
public string Word { get; set; }
public double Weight { get; set; }
}
-}
+}
\ No newline at end of file
diff --git a/Analyser/UndirectWeightedGraph.cs b/Analyser/UndirectWeightedGraph.cs
index b1afd0c..fe622cc 100644
--- a/Analyser/UndirectWeightedGraph.cs
+++ b/Analyser/UndirectWeightedGraph.cs
@@ -1,4 +1,3 @@
-using System;
using System.Collections.Generic;
using System.Linq;
@@ -13,9 +12,10 @@ public class Edge
public class UndirectWeightedGraph
{
- private static readonly double d = 0.85;
+ private const double D = 0.85;
+
+ public IDictionary> Graph { get; set; }
- public IDictionary> Graph { get; set; }
public UndirectWeightedGraph()
{
Graph = new Dictionary>();
@@ -24,17 +24,13 @@ public UndirectWeightedGraph()
public void AddEdge(string start, string end, double weight)
{
if (!Graph.ContainsKey(start))
- {
Graph[start] = new List();
- }
if (!Graph.ContainsKey(end))
- {
Graph[end] = new List();
- }
- Graph[start].Add(new Edge(){ Start = start, End = end, Weight = weight });
- Graph[end].Add(new Edge(){ Start = end, End = start, Weight = weight });
+ Graph[start].Add(new Edge() {Start = start, End = end, Weight = weight});
+ Graph[end].Add(new Edge() {Start = end, End = start, Weight = weight});
}
public IDictionary Rank()
@@ -44,7 +40,7 @@ public IDictionary Rank()
// init scores
var count = Graph.Count > 0 ? Graph.Count : 1;
- var wsdef = 1.0/count;
+ var wsdef = 1.0 / count;
foreach (var pair in Graph)
{
@@ -60,10 +56,8 @@ public IDictionary Rank()
{
var s = 0d;
foreach (var edge in Graph[n])
- {
- s += edge.Weight/outSum[edge.End]*ws[edge.End];
- }
- ws[n] = (1 - d) + d*s;
+ s += edge.Weight / outSum[edge.End] * ws[edge.End];
+ ws[n] = (1 - D) + D * s;
}
}
@@ -73,19 +67,13 @@ public IDictionary Rank()
foreach (var w in ws.Values)
{
if (w < minRank)
- {
minRank = w;
- }
- if(w > maxRank)
- {
+ if (w > maxRank)
maxRank = w;
- }
}
foreach (var pair in ws.ToList())
- {
- ws[pair.Key] = (pair.Value - minRank/10.0)/(maxRank - minRank/10.0);
- }
+ ws[pair.Key] = (pair.Value - minRank / 10.0) / (maxRank - minRank / 10.0);
return ws;
}
diff --git a/Segmenter/Common/Counter.cs b/Segmenter/Common/Counter.cs
index 692613b..1feb523 100644
--- a/Segmenter/Common/Counter.cs
+++ b/Segmenter/Common/Counter.cs
@@ -1,5 +1,4 @@
-using System;
-using System.Collections.Generic;
+using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter.Common
@@ -53,30 +52,32 @@ public interface ICounter
bool Contains(T key);
}
- public class Counter: ICounter
+ public class Counter : ICounter
{
- private Dictionary data = new Dictionary();
+ private readonly Dictionary _data = new Dictionary();
- public Counter() {}
+ private Counter()
+ {
+ }
public Counter(IEnumerable items)
{
CountItems(items);
}
- public int Count => data.Count;
- public int Total => data.Values.Sum();
- public IEnumerable> Elements => data;
+ public int Count => _data.Count;
+ public int Total => _data.Values.Sum();
+ public IEnumerable> Elements => _data;
public int this[T key]
{
- get => data.ContainsKey(key) ? data[key] : 0;
- set => data[key] = value;
+ get => _data.ContainsKey(key) ? _data[key] : 0;
+ set => _data[key] = value;
}
public IEnumerable> MostCommon(int n = -1)
{
- var pairs = data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value);
+ var pairs = _data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value);
return n < 0 ? pairs : pairs.Take(n);
}
@@ -103,7 +104,7 @@ public void Add(ICounter other)
public ICounter Union(ICounter other)
{
var result = new Counter();
- foreach (var pair in data)
+ foreach (var pair in _data)
{
var count = pair.Value;
var otherCount = other[pair.Key];
@@ -112,31 +113,26 @@ public ICounter Union(ICounter other)
}
foreach (var pair in other.Elements)
- {
if (!Contains(pair.Key))
- {
result[pair.Key] = pair.Value;
- }
- }
+
return result;
}
public void Remove(T key)
{
- if (data.ContainsKey(key))
- {
- data.Remove(key);
- }
+ if (_data.ContainsKey(key))
+ _data.Remove(key);
}
public void Clear()
{
- data.Clear();
+ _data.Clear();
}
public bool Contains(T key)
{
- return data.ContainsKey(key);
+ return _data.ContainsKey(key);
}
#region Private Methods
@@ -144,35 +140,27 @@ public bool Contains(T key)
private void CountItems(IEnumerable items)
{
foreach (var item in items)
- {
- data[item] = data.GetDefault(item, 0) + 1;
- }
+ _data[item] = _data.GetDefault(item, 0) + 1;
}
private void CountPairs(IEnumerable> pairs)
{
foreach (var pair in pairs)
- {
this[pair.Key] += pair.Value;
- }
}
private void SubtractItems(IEnumerable items)
{
foreach (var item in items)
- {
- data[item] = data.GetDefault(item, 0) - 1;
- }
+ _data[item] = _data.GetDefault(item, 0) - 1;
}
private void SubtractPairs(IEnumerable> pairs)
{
foreach (var pair in pairs)
- {
this[pair.Key] -= pair.Value;
- }
}
#endregion
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/Common/Extensions.cs b/Segmenter/Common/Extensions.cs
index 5da7888..7ca6685 100644
--- a/Segmenter/Common/Extensions.cs
+++ b/Segmenter/Common/Extensions.cs
@@ -1,5 +1,4 @@
-using System;
-using System.Collections.Generic;
+using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
@@ -24,7 +23,6 @@ public static bool IsNotNull(this object obj)
#endregion
-
#region Enumerable
public static bool IsEmpty(this IEnumerable enumerable)
@@ -42,21 +40,16 @@ public static TValue GetValueOrDefault(this IDictionary(this IDictionary dict, TKey key, TValue defaultValue)
+ public static TValue GetDefault(this IDictionary dict, TKey key,
+ TValue defaultValue)
{
- if (dict.ContainsKey(key))
- {
- return dict[key];
- }
- return defaultValue;
+ return dict.ContainsKey(key) ? dict[key] : defaultValue;
}
public static void Update(this IDictionary dict, IDictionary other)
{
foreach (var key in other.Keys)
- {
dict[key] = other[key];
- }
}
#endregion
@@ -65,23 +58,12 @@ public static void Update(this IDictionary dict, IDi
public static string Left(this string s, int endIndex)
{
- if (string.IsNullOrEmpty(s))
- {
- return s;
- }
-
- return s.Substring(0, endIndex);
+ return string.IsNullOrEmpty(s) ? s : s.Substring(0, endIndex);
}
public static string Right(this string s, int startIndex)
{
- if (string.IsNullOrEmpty(s))
- {
- return s;
- }
-
-
- return s.Substring(startIndex);
+ return string.IsNullOrEmpty(s) ? s : s.Substring(startIndex);
}
public static string Sub(this string s, int startIndex, int endIndex)
@@ -93,7 +75,7 @@ public static bool IsInt32(this string s)
{
return RegexDigits.IsMatch(s);
}
-
+
public static string[] SplitLines(this string s)
{
return RegexNewline.Split(s);
@@ -107,7 +89,7 @@ public static string Join(this IEnumerable inputs, string separator = ",
public static IEnumerable SubGroupValues(this GroupCollection groups)
{
var result = from Group g in groups
- select g.Value;
+ select g.Value;
return result.Skip(1);
}
@@ -122,7 +104,7 @@ public static int ToInt32(this char ch)
public static char ToChar(this int i)
{
- return (char)i;
+ return (char) i;
}
#endregion
diff --git a/Segmenter/Common/FileExtension.cs b/Segmenter/Common/FileExtension.cs
index 1dd0157..f70e244 100644
--- a/Segmenter/Common/FileExtension.cs
+++ b/Segmenter/Common/FileExtension.cs
@@ -1,5 +1,4 @@
using Microsoft.Extensions.FileProviders;
-using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
@@ -14,29 +13,26 @@ public static string ReadEmbeddedAllLine(string path)
return ReadEmbeddedAllLine(path, Encoding.UTF8);
}
- public static string ReadEmbeddedAllLine(string path,Encoding encoding)
+ public static string ReadEmbeddedAllLine(string path, Encoding encoding)
{
var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly);
var fileInfo = provider.GetFileInfo(path);
using (var sr = new StreamReader(fileInfo.CreateReadStream(), encoding))
- {
return sr.ReadToEnd();
- }
}
public static List ReadEmbeddedAllLines(string path, Encoding encoding)
{
var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly);
var fileInfo = provider.GetFileInfo(path);
- List list = new List();
- using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding))
+ var list = new List();
+ using (var streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding))
{
string item;
while ((item = streamReader.ReadLine()) != null)
- {
list.Add(item);
- }
}
+
return list;
}
@@ -45,4 +41,4 @@ public static List ReadEmbeddedAllLines(string path)
return ReadEmbeddedAllLines(path, Encoding.UTF8);
}
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/Common/Trie.cs b/Segmenter/Common/Trie.cs
index 3e8c2a5..06b7492 100644
--- a/Segmenter/Common/Trie.cs
+++ b/Segmenter/Common/Trie.cs
@@ -15,7 +15,7 @@ public TrieNode(char ch)
{
Char = ch;
Frequency = 0;
-
+
// TODO: or an empty dict?
//Children = null;
}
@@ -23,48 +23,33 @@ public TrieNode(char ch)
public int Insert(string s, int pos, int freq = 1)
{
if (string.IsNullOrEmpty(s) || pos >= s.Length)
- {
return 0;
- }
if (Children == null)
- {
Children = new Dictionary();
- }
var c = s[pos];
if (!Children.ContainsKey(c))
- {
Children[c] = new TrieNode(c);
- }
var curNode = Children[c];
- if (pos == s.Length - 1)
- {
- curNode.Frequency += freq;
- return curNode.Frequency;
- }
+ if (pos != s.Length - 1) return curNode.Insert(s, pos + 1, freq);
+ curNode.Frequency += freq;
+ return curNode.Frequency;
- return curNode.Insert(s, pos + 1, freq);
}
public TrieNode Search(string s, int pos)
{
if (string.IsNullOrEmpty(s))
- {
return null;
- }
// if out of range or without any child nodes
if (pos >= s.Length || Children == null)
- {
return null;
- }
// if reaches the last char of s, it's time to make the decision.
if (pos == s.Length - 1)
- {
return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null;
- }
// continue if necessary.
return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null;
}
@@ -75,7 +60,9 @@ public interface ITrie
//string BestMatch(string word, long maxTime);
bool Contains(string word);
int Frequency(string word);
+
int Insert(string word, int freq = 1);
+
//bool Remove(string word);
int Count { get; }
int TotalFrequency { get; }
@@ -83,9 +70,9 @@ public interface ITrie
public class Trie : ITrie
{
- private static readonly char RootChar = '\0';
+ private const char RootChar = '\0';
- internal TrieNode Root;
+ internal readonly TrieNode Root;
public int Count { get; private set; }
public int TotalFrequency { get; private set; }
@@ -125,11 +112,9 @@ public int Insert(string word, int freq = 1)
CheckWord(word);
var i = Root.Insert(word.Trim(), 0, freq);
- if (i > 0)
- {
- TotalFrequency += freq;
- Count++;
- }
+ if (i <= 0) return i;
+ TotalFrequency += freq;
+ Count++;
return i;
}
@@ -143,9 +128,7 @@ public IEnumerable ChildChars(string prefix)
private void CheckWord(string word)
{
if (string.IsNullOrWhiteSpace(word))
- {
throw new ArgumentException("word must not be null or whitespace");
- }
}
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/ConfigManager.cs b/Segmenter/ConfigManager.cs
index af2f9a0..803e09e 100644
--- a/Segmenter/ConfigManager.cs
+++ b/Segmenter/ConfigManager.cs
@@ -1,5 +1,4 @@
-using System;
-using System.IO;
+using System.IO;
namespace JiebaNet.Segmenter
{
@@ -9,44 +8,23 @@ public static string ConfigFileBaseDir
{
get
{
- var configFileDir = "Resources";
+ const string configFileDir = "Resources";
return configFileDir;
}
}
- public static string MainDictFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); }
- }
+ public static string MainDictFile => Path.Combine(ConfigFileBaseDir, "dict.txt");
- public static string ProbTransFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); }
- }
+ public static string ProbTransFile => Path.Combine(ConfigFileBaseDir, "prob_trans.json");
- public static string ProbEmitFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); }
- }
+ public static string ProbEmitFile => Path.Combine(ConfigFileBaseDir, "prob_emit.json");
- public static string PosProbStartFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); }
- }
+ public static string PosProbStartFile => Path.Combine(ConfigFileBaseDir, "pos_prob_start.json");
- public static string PosProbTransFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); }
- }
+ public static string PosProbTransFile => Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json");
- public static string PosProbEmitFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); }
- }
+ public static string PosProbEmitFile => Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json");
- public static string CharStateTabFile
- {
- get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); }
- }
+ public static string CharStateTabFile => Path.Combine(ConfigFileBaseDir, "char_state_tab.json");
}
}
\ No newline at end of file
diff --git a/Segmenter/Constants.cs b/Segmenter/Constants.cs
index 4315161..99be134 100644
--- a/Segmenter/Constants.cs
+++ b/Segmenter/Constants.cs
@@ -5,11 +5,13 @@ namespace JiebaNet.Segmenter
{
public class Constants
{
- public static readonly double MinProb = -3.14e100;
+ public const double MinProb = -3.14e100;
- public static readonly List NounPos = new List() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" };
- public static readonly List VerbPos = new List() { "v", "vd", "vg", "vi", "vn", "vq" };
+ public static readonly List NounPos = new List()
+ {"n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz"};
+
+ public static readonly List VerbPos = new List() {"v", "vd", "vg", "vi", "vn", "vq"};
public static readonly List NounAndVerbPos = NounPos.Union(VerbPos).ToList();
- public static readonly List IdiomPos = new List() { "i" };
+ public static readonly List IdiomPos = new List() {"i"};
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/DefaultDictionary.cs b/Segmenter/DefaultDictionary.cs
index 45bac66..e6690ae 100644
--- a/Segmenter/DefaultDictionary.cs
+++ b/Segmenter/DefaultDictionary.cs
@@ -1,8 +1,4 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
+using System.Collections.Generic;
namespace JiebaNet.Segmenter
{
@@ -16,9 +12,10 @@ public class DefaultDictionary : Dictionary
{
Add(key, default(TValue));
}
+
return base[key];
}
- set { base[key] = value; }
+ set => base[key] = value;
}
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/FinalSeg/IFinalSeg.cs b/Segmenter/FinalSeg/IFinalSeg.cs
index ed88037..f8e6d04 100644
--- a/Segmenter/FinalSeg/IFinalSeg.cs
+++ b/Segmenter/FinalSeg/IFinalSeg.cs
@@ -1,10 +1,9 @@
-using System;
using System.Collections.Generic;
namespace JiebaNet.Segmenter.FinalSeg
{
///
- /// ڴʵз֮ʹô˽ӿڽз֣ĬʵΪHMM
+ /// 在词典切分之后,使用此接口进行切分,默认实现为HMM方法。
///
public interface IFinalSeg
{
diff --git a/Segmenter/FinalSeg/Viterbi.cs b/Segmenter/FinalSeg/Viterbi.cs
index ca78aa6..7e3df38 100644
--- a/Segmenter/FinalSeg/Viterbi.cs
+++ b/Segmenter/FinalSeg/Viterbi.cs
@@ -2,7 +2,6 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
-using System.Text;
using System.Text.RegularExpressions;
using JiebaNet.Segmenter.Common;
using Newtonsoft.Json;
@@ -12,7 +11,7 @@ namespace JiebaNet.Segmenter.FinalSeg
public class Viterbi : IFinalSeg
{
private static readonly Lazy Lazy = new Lazy(() => new Viterbi());
- private static readonly char[] States = { 'B', 'M', 'E', 'S' };
+ private static readonly char[] States = {'B', 'M', 'E', 'S'};
private static readonly Regex RegexChinese = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
private static readonly Regex RegexSkip = new Regex(@"([a-zA-Z0-9]+(?:\.\d+)?%?)", RegexOptions.Compiled);
@@ -28,10 +27,7 @@ private Viterbi()
}
// TODO: synchronized
- public static Viterbi Instance
- {
- get { return Lazy.Value; }
- }
+ public static Viterbi Instance => Lazy.Value;
public IEnumerable Cut(string sentence)
{
@@ -39,15 +35,14 @@ public IEnumerable Cut(string sentence)
foreach (var blk in RegexChinese.Split(sentence))
{
if (RegexChinese.IsMatch(blk))
- {
tokens.AddRange(ViterbiCut(blk));
- }
else
{
var segments = RegexSkip.Split(blk).Where(seg => !string.IsNullOrEmpty(seg));
tokens.AddRange(segments);
}
}
+
return tokens;
}
@@ -60,10 +55,10 @@ private void LoadModel()
_prevStatus = new Dictionary()
{
- {'B', new []{'E', 'S'}},
- {'M', new []{'M', 'B'}},
- {'S', new []{'S', 'E'}},
- {'E', new []{'B', 'M'}}
+ {'B', new[] {'E', 'S'}},
+ {'M', new[] {'M', 'B'}},
+ {'S', new[] {'S', 'E'}},
+ {'E', new[] {'B', 'M'}}
};
_startProbs = new Dictionary()
@@ -108,20 +103,20 @@ private IEnumerable ViterbiCut(string sentence)
{
var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb);
- Pair candidate = new Pair('\0', double.MinValue);
+ var candidate = new Pair('\0', double.MinValue);
foreach (var y0 in _prevStatus[y])
{
var tranp = _transProbs[y0].GetDefault(y, Constants.MinProb);
tranp = v[i - 1][y0] + tranp + emp;
- if (candidate.Freq <= tranp)
- {
- candidate.Freq = tranp;
- candidate.Key = y0;
- }
+ if (!(candidate.Freq <= tranp)) continue;
+ candidate.Freq = tranp;
+ candidate.Key = y0;
}
+
vv[y] = candidate.Freq;
newPath[y] = new Node(y, path[candidate.Key]);
}
+
path = newPath;
}
@@ -135,6 +130,7 @@ private IEnumerable ViterbiCut(string sentence)
posList.Add(finalPath.Value);
finalPath = finalPath.Parent;
}
+
posList.Reverse();
var tokens = new List();
@@ -142,23 +138,24 @@ private IEnumerable ViterbiCut(string sentence)
for (var i = 0; i < sentence.Length; i++)
{
var pos = posList[i];
- if (pos == 'B')
- begin = i;
- else if (pos == 'E')
- {
- tokens.Add(sentence.Sub(begin, i + 1));
- next = i + 1;
- }
- else if (pos == 'S')
+ switch (pos)
{
- tokens.Add(sentence.Sub(i, i + 1));
- next = i + 1;
+ case 'B':
+ begin = i;
+ break;
+ case 'E':
+ tokens.Add(sentence.Sub(begin, i + 1));
+ next = i + 1;
+ break;
+ case 'S':
+ tokens.Add(sentence.Sub(i, i + 1));
+ next = i + 1;
+ break;
}
}
+
if (next < sentence.Length)
- {
tokens.Add(sentence.Substring(next));
- }
return tokens;
}
diff --git a/Segmenter/JiebaSegmenter.cs b/Segmenter/JiebaSegmenter.cs
index 98bd7b3..18787e4 100644
--- a/Segmenter/JiebaSegmenter.cs
+++ b/Segmenter/JiebaSegmenter.cs
@@ -16,13 +16,14 @@ public class JiebaSegmenter
private static readonly IFinalSeg FinalSeg = Viterbi.Instance;
private static readonly ISet LoadedPath = new HashSet();
- private static readonly object locker = new object();
+ private static readonly object Locker = new object();
internal IDictionary UserWordTagTab { get; set; }
#region Regular Expressions
- internal static readonly Regex RegexChineseDefault = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", RegexOptions.Compiled);
+ internal static readonly Regex RegexChineseDefault =
+ new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", RegexOptions.Compiled);
internal static readonly Regex RegexSkipDefault = new Regex(@"(\r\n|\s)", RegexOptions.Compiled);
@@ -31,7 +32,8 @@ public class JiebaSegmenter
internal static readonly Regex RegexEnglishChars = new Regex(@"[a-zA-Z0-9]", RegexOptions.Compiled);
- internal static readonly Regex RegexUserDict = new Regex("^(?.+?)(? [0-9]+)?(? [a-z]+)?$", RegexOptions.Compiled);
+ internal static readonly Regex RegexUserDict =
+ new Regex("^(?.+?)(? [0-9]+)?(? [a-z]+)?$", RegexOptions.Compiled);
#endregion
@@ -52,7 +54,7 @@ public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true
{
var reHan = RegexChineseDefault;
var reSkip = RegexSkipDefault;
- Func> cutMethod = null;
+ Func> cutMethod;
if (cutAll)
{
@@ -61,17 +63,11 @@ public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true
}
if (cutAll)
- {
cutMethod = CutAll;
- }
else if (hmm)
- {
cutMethod = CutDag;
- }
else
- {
cutMethod = CutDagWithoutHmm;
- }
return CutIt(text, cutMethod, reHan, reSkip, cutAll);
}
@@ -84,28 +80,12 @@ public IEnumerable CutForSearch(string text, bool hmm = true)
foreach (var w in words)
{
if (w.Length > 2)
- {
- foreach (var i in Enumerable.Range(0, w.Length - 1))
- {
- var gram2 = w.Substring(i, 2);
- if (WordDict.ContainsWord(gram2))
- {
- result.Add(gram2);
- }
- }
- }
+ result.AddRange(Enumerable.Range(0, w.Length - 1).Select(i => w.Substring(i, 2))
+ .Where(gram2 => WordDict.ContainsWord(gram2)));
if (w.Length > 3)
- {
- foreach (var i in Enumerable.Range(0, w.Length - 2))
- {
- var gram3 = w.Substring(i, 3);
- if (WordDict.ContainsWord(gram3))
- {
- result.Add(gram3);
- }
- }
- }
+ result.AddRange(Enumerable.Range(0, w.Length - 2).Select(i => w.Substring(i, 3))
+ .Where(gram3 => WordDict.ContainsWord(gram3)));
result.Add(w);
}
@@ -138,20 +118,17 @@ public IEnumerable Tokenize(string text, TokenizerMode mode = TokenizerMo
{
var gram2 = w.Substring(i, 2);
if (WordDict.ContainsWord(gram2))
- {
result.Add(new Token(gram2, start + i, start + i + 2));
- }
}
}
+
if (width > 3)
{
for (var i = 0; i < width - 2; i++)
{
var gram3 = w.Substring(i, 3);
if (WordDict.ContainsWord(gram3))
- {
result.Add(new Token(gram3, start + i, start + i + 3));
- }
}
}
@@ -170,31 +147,33 @@ internal IDictionary> GetDag(string sentence)
var dag = new Dictionary>();
var trie = WordDict.Trie;
- var N = sentence.Length;
+ var n = sentence.Length;
for (var k = 0; k < sentence.Length; k++)
{
- var templist = new List();
+ var tempList = new List();
var i = k;
var frag = sentence.Substring(k, 1);
- while (i < N && trie.ContainsKey(frag))
+ while (i < n && trie.ContainsKey(frag))
{
if (trie[frag] > 0)
{
- templist.Add(i);
+ tempList.Add(i);
}
i++;
// TODO:
- if (i < N)
+ if (i < n)
{
frag = sentence.Sub(k, i + 1);
}
}
- if (templist.Count == 0)
+
+ if (tempList.Count == 0)
{
- templist.Add(k);
+ tempList.Add(k);
}
- dag[k] = templist;
+
+ dag[k] = tempList;
}
return dag;
@@ -203,24 +182,24 @@ internal IDictionary> GetDag(string sentence)
internal IDictionary> Calc(string sentence, IDictionary> dag)
{
var n = sentence.Length;
- var route = new Dictionary>();
- route[n] = new Pair(0, 0.0);
+ var route = new Dictionary> {[n] = new Pair(0, 0.0)};
var logtotal = Math.Log(WordDict.Total);
for (var i = n - 1; i > -1; i--)
{
var candidate = new Pair(-1, double.MinValue);
- foreach (int x in dag[i])
+ foreach (var x in dag[i])
{
- var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal + route[x + 1].Freq;
- if (candidate.Freq < freq)
- {
- candidate.Freq = freq;
- candidate.Key = x;
- }
+ var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal +
+ route[x + 1].Freq;
+ if (!(candidate.Freq < freq)) continue;
+ candidate.Freq = freq;
+ candidate.Key = x;
}
+
route[i] = candidate;
}
+
return route;
}
@@ -244,11 +223,9 @@ internal IEnumerable CutAll(string sentence)
{
foreach (var j in nexts)
{
- if (j > k)
- {
- words.Add(sentence.Substring(k, j + 1 - k));
- lastPos = j;
- }
+ if (j <= k) continue;
+ words.Add(sentence.Substring(k, j + 1 - k));
+ lastPos = j;
}
}
}
@@ -281,8 +258,10 @@ internal IEnumerable CutDag(string sentence)
AddBufferToWordList(tokens, buf);
buf = string.Empty;
}
+
tokens.Add(w);
}
+
x = y;
}
@@ -302,17 +281,16 @@ internal IEnumerable CutDagWithoutHmm(string sentence)
var words = new List();
var x = 0;
- string buf = string.Empty;
- var N = sentence.Length;
+ var buf = string.Empty;
+ var sentenceLength = sentence.Length;
- var y = -1;
- while (x < N)
+ while (x < sentenceLength)
{
- y = route[x].Key + 1;
- var l_word = sentence.Substring(x, y - x);
- if (RegexEnglishChars.IsMatch(l_word) && l_word.Length == 1)
+ var y = route[x].Key + 1;
+ var lWord = sentence.Substring(x, y - x);
+ if (RegexEnglishChars.IsMatch(lWord) && lWord.Length == 1)
{
- buf += l_word;
+ buf += lWord;
x = y;
}
else
@@ -322,7 +300,8 @@ internal IEnumerable CutDagWithoutHmm(string sentence)
words.Add(buf);
buf = string.Empty;
}
- words.Add(l_word);
+
+ words.Add(lWord);
x = y;
}
}
@@ -336,7 +315,7 @@ internal IEnumerable CutDagWithoutHmm(string sentence)
}
internal IEnumerable CutIt(string text, Func> cutMethod,
- Regex reHan, Regex reSkip, bool cutAll)
+ Regex reHan, Regex reSkip, bool cutAll)
{
var result = new List();
var blocks = reHan.Split(text);
@@ -349,10 +328,7 @@ internal IEnumerable CutIt(string text, Func
if (reHan.IsMatch(blk))
{
- foreach (var word in cutMethod(blk))
- {
- result.Add(word);
- }
+ result.AddRange(cutMethod(blk));
}
else
{
@@ -360,20 +336,12 @@ internal IEnumerable CutIt(string text, Func
foreach (var x in tmp)
{
if (reSkip.IsMatch(x))
- {
result.Add(x);
- }
else if (!cutAll)
- {
foreach (var ch in x)
- {
result.Add(ch.ToString());
- }
- }
else
- {
result.Add(x);
- }
}
}
}
@@ -394,7 +362,7 @@ public void LoadUserDict(string userDictFile)
var dictFullPath = Path.GetFullPath(userDictFile);
Debug.WriteLine("Initializing user dictionary: " + userDictFile);
- lock (locker)
+ lock (Locker)
{
if (LoadedPath.Contains(dictFullPath))
return;
@@ -425,7 +393,7 @@ public void LoadUserDict(string userDictFile)
}
catch (IOException e)
{
- Debug.Fail(string.Format("'{0}' load failure, reason: {1}", dictFullPath, e.Message));
+ Debug.Fail($"'{dictFullPath}' load failure, reason: {e.Message}");
}
catch (FormatException fe)
{
@@ -440,6 +408,7 @@ public void AddWord(string word, int freq = 0, string tag = null)
{
freq = WordDict.SuggestFreq(word, Cut(word, hmm: false));
}
+
WordDict.AddWord(word, freq);
// Add user word tag of POS
diff --git a/Segmenter/Pair.cs b/Segmenter/Pair.cs
index 59d1d94..bc008d3 100644
--- a/Segmenter/Pair.cs
+++ b/Segmenter/Pair.cs
@@ -2,7 +2,7 @@
{
public class Pair
{
- public TKey Key { get;set; }
+ public TKey Key { get; set; }
public double Freq { get; set; }
public Pair(TKey key, double freq)
@@ -16,4 +16,4 @@ public override string ToString()
return "Candidate [Key=" + Key + ", Freq=" + Freq + "]";
}
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/PosSeg/Pair.cs b/Segmenter/PosSeg/Pair.cs
index 6eda19b..fc57b90 100644
--- a/Segmenter/PosSeg/Pair.cs
+++ b/Segmenter/PosSeg/Pair.cs
@@ -4,6 +4,7 @@ public class Pair
{
public string Word { get; set; }
public string Flag { get; set; }
+
public Pair(string word, string flag)
{
Word = word;
@@ -12,7 +13,7 @@ public Pair(string word, string flag)
public override string ToString()
{
- return string.Format("{0}/{1}", Word, Flag);
+ return $"{Word}/{Flag}";
}
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/PosSeg/PosSegmenter.cs b/Segmenter/PosSeg/PosSegmenter.cs
index 7afbe20..eed7402 100644
--- a/Segmenter/PosSeg/PosSegmenter.cs
+++ b/Segmenter/PosSeg/PosSegmenter.cs
@@ -18,7 +18,9 @@ public class PosSegmenter
#region Regular Expressions
- internal static readonly Regex RegexChineseInternal = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
+ internal static readonly Regex RegexChineseInternal =
+ new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
+
internal static readonly Regex RegexSkipInternal = new Regex(@"(\r\n|\s)", RegexOptions.Compiled);
internal static readonly Regex RegexChineseDetail = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
@@ -49,7 +51,7 @@ private static void LoadWordTagTab()
var tokens = line.Split(' ');
if (tokens.Length < 2)
{
- Debug.Fail(string.Format("Invalid line: {0}", line));
+ Debug.Fail($"Invalid line: {line}");
continue;
}
@@ -61,7 +63,7 @@ private static void LoadWordTagTab()
}
catch (System.IO.IOException e)
{
- Debug.Fail(string.Format("Word tag table load failure, reason: {0}", e.Message));
+ Debug.Fail($"Word tag table load failure, reason: {e.Message}");
}
catch (FormatException fe)
{
@@ -69,7 +71,7 @@ private static void LoadWordTagTab()
}
}
- private JiebaSegmenter _segmenter;
+ private readonly JiebaSegmenter _segmenter;
public PosSegmenter()
{
@@ -83,11 +85,9 @@ public PosSegmenter(JiebaSegmenter segmenter)
private void CheckNewUserWordTags()
{
- if (_segmenter.UserWordTagTab.IsNotEmpty())
- {
- _wordTagTab.Update(_segmenter.UserWordTagTab);
- _segmenter.UserWordTagTab = new Dictionary();
- }
+ if (!_segmenter.UserWordTagTab.IsNotEmpty()) return;
+ _wordTagTab.Update(_segmenter.UserWordTagTab);
+ _segmenter.UserWordTagTab = new Dictionary();
}
public IEnumerable Cut(string text, bool hmm = true)
@@ -104,13 +104,9 @@ internal IEnumerable CutInternal(string text, bool hmm = true)
var blocks = RegexChineseInternal.Split(text);
Func> cutMethod = null;
if (hmm)
- {
cutMethod = CutDag;
- }
else
- {
cutMethod = CutDagWithoutHmm;
- }
var tokens = new List();
foreach (var blk in blocks)
@@ -135,17 +131,11 @@ internal IEnumerable CutInternal(string text, bool hmm = true)
// TODO: each char?
var xxs = xx.ToString();
if (RegexNumbers.IsMatch(xxs))
- {
tokens.Add(new Pair(xxs, "m"));
- }
else if (RegexEnglishWords.IsMatch(x))
- {
tokens.Add(new Pair(xxs, "eng"));
- }
else
- {
tokens.Add(new Pair(xxs, "x"));
- }
}
}
}
@@ -170,9 +160,7 @@ internal IEnumerable CutDag(string sentence)
var y = route[x].Key + 1;
var w = sentence.Substring(x, y - x);
if (y - x == 1)
- {
buf += w;
- }
else
{
if (buf.Length > 0)
@@ -180,15 +168,15 @@ internal IEnumerable CutDag(string sentence)
AddBufferToWordList(tokens, buf);
buf = string.Empty;
}
+
tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x")));
}
+
x = y;
}
if (buf.Length > 0)
- {
AddBufferToWordList(tokens, buf);
- }
return tokens;
}
@@ -204,10 +192,9 @@ internal IEnumerable CutDagWithoutHmm(string sentence)
var buf = string.Empty;
var n = sentence.Length;
- var y = -1;
while (x < n)
{
- y = route[x].Key + 1;
+ var y = route[x].Key + 1;
var w = sentence.Substring(x, y - x);
// TODO: char or word?
if (RegexEnglishChar.IsMatch(w))
@@ -222,15 +209,14 @@ internal IEnumerable CutDagWithoutHmm(string sentence)
tokens.Add(new Pair(buf, "eng"));
buf = string.Empty;
}
+
tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x")));
x = y;
}
}
if (buf.Length > 0)
- {
tokens.Add(new Pair(buf, "eng"));
- }
return tokens;
}
@@ -250,21 +236,13 @@ internal IEnumerable CutDetail(string text)
var tmp = RegexSkipDetail.Split(blk);
foreach (var x in tmp)
{
- if (!string.IsNullOrWhiteSpace(x))
- {
- if (RegexNumbers.IsMatch(x))
- {
- tokens.Add(new Pair(x, "m"));
- }
- else if(RegexEnglishWords.IsMatch(x))
- {
- tokens.Add(new Pair(x, "eng"));
- }
- else
- {
- tokens.Add(new Pair(x, "x"));
- }
- }
+ if (string.IsNullOrWhiteSpace(x)) continue;
+ if (RegexNumbers.IsMatch(x))
+ tokens.Add(new Pair(x, "m"));
+ else if (RegexEnglishWords.IsMatch(x))
+ tokens.Add(new Pair(x, "eng"));
+ else
+ tokens.Add(new Pair(x, "x"));
}
}
}
diff --git a/Segmenter/PosSeg/Viterbi.cs b/Segmenter/PosSeg/Viterbi.cs
index 349815d..3e6301c 100644
--- a/Segmenter/PosSeg/Viterbi.cs
+++ b/Segmenter/PosSeg/Viterbi.cs
@@ -21,10 +21,7 @@ private Viterbi()
}
// TODO: synchronized
- public static Viterbi Instance
- {
- get { return Lazy.Value; }
- }
+ public static Viterbi Instance => Lazy.Value;
public IEnumerable Cut(string sentence)
{
@@ -38,24 +35,27 @@ public IEnumerable Cut(string sentence)
var parts = posList[i].Split('-');
var charState = parts[0][0];
var pos = parts[1];
- if (charState == 'B')
- begin = i;
- else if (charState == 'E')
- {
- tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos));
- next = i + 1;
- }
- else if (charState == 'S')
+ switch (charState)
{
- tokens.Add(new Pair(sentence.Sub(i, i + 1), pos));
- next = i + 1;
+ case 'B':
+ begin = i;
+ break;
+ case 'E':
+ tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos));
+ next = i + 1;
+ break;
+ case 'S':
+ tokens.Add(new Pair(sentence.Sub(i, i + 1), pos));
+ next = i + 1;
+ break;
}
}
+
if (next < sentence.Length)
{
tokens.Add(new Pair(sentence.Substring(next), posList[next].Split('-')[1]));
}
-
+
return tokens;
}
@@ -138,6 +138,7 @@ private Tuple> ViterbiCut(string sentence)
state = y0;
}
}
+
v[i][y] = prob;
memPath[i][y] = state;
}
@@ -150,8 +151,9 @@ private Tuple> ViterbiCut(string sentence)
foreach (var endPoint in last)
{
// TODO: compare two very small values;
- if (endProb < endPoint.Prob ||
- (endProb == endPoint.Prob && String.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0))
+ if (endProb < endPoint.Prob ||
+ (endProb == endPoint.Prob &&
+ string.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0))
{
endProb = endPoint.Prob;
endState = endPoint.State;
@@ -161,7 +163,7 @@ private Tuple> ViterbiCut(string sentence)
var route = new string[sentence.Length];
var n = sentence.Length - 1;
var curState = endState;
- while(n >= 0)
+ while (n >= 0)
{
route[n] = curState;
curState = memPath[n][curState];
diff --git a/Segmenter/Segmenter.csproj b/Segmenter/Segmenter.csproj
index 9a8b10a..0404426 100644
--- a/Segmenter/Segmenter.csproj
+++ b/Segmenter/Segmenter.csproj
@@ -55,8 +55,8 @@
-
-
+
+
\ No newline at end of file
diff --git a/Segmenter/Spelling/SpellChecker.cs b/Segmenter/Spelling/SpellChecker.cs
index 518f345..290f970 100644
--- a/Segmenter/Spelling/SpellChecker.cs
+++ b/Segmenter/Spelling/SpellChecker.cs
@@ -14,7 +14,7 @@ public class SpellChecker : ISpellChecker
internal static readonly WordDictionary WordDict = WordDictionary.Instance;
internal readonly Trie WordTrie;
- internal readonly Dictionary> FirstChars;
+ internal readonly Dictionary> FirstChars;
public SpellChecker()
{
@@ -24,21 +24,16 @@ public SpellChecker()
foreach (var wd in wordDict.Trie)
{
- if (wd.Value > 0)
- {
- WordTrie.Insert(wd.Key, wd.Value);
-
- if (wd.Key.Length >= 2)
- {
- var second = wd.Key[1];
- var first = wd.Key[0];
- if (!FirstChars.ContainsKey(second))
- {
- FirstChars[second] = new HashSet();
- }
- FirstChars[second].Add(first);
- }
- }
+ if (wd.Value <= 0) continue;
+ WordTrie.Insert(wd.Key, wd.Value);
+
+ if (wd.Key.Length < 2) continue;
+ var second = wd.Key[1];
+ var first = wd.Key[0];
+ if (!FirstChars.ContainsKey(second))
+ FirstChars[second] = new HashSet();
+
+ FirstChars[second].Add(first);
}
}
@@ -46,9 +41,7 @@ internal ISet GetEdits1(string word)
{
var splits = new List();
for (var i = 0; i <= word.Length; i++)
- {
- splits.Add(new WordSplit() { Left = word.Substring(0, i), Right = word.Substring(i) });
- }
+ splits.Add(new WordSplit() {Left = word.Substring(0, i), Right = word.Substring(i)});
var deletes = splits
.Where(s => !string.IsNullOrEmpty(s.Right))
@@ -63,20 +56,15 @@ internal ISet GetEdits1(string word)
{
var firsts = FirstChars[word[1]];
foreach (var first in firsts)
- {
if (first != word[0])
- {
replaces.Add(first + word.Substring(1));
- }
- }
var node = WordTrie.Root.Children[word[0]];
- for (int i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
+ for (var i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
{
foreach (var c in node.Children.Keys)
- {
replaces.Add(word.Substring(0, i) + c + word.Substring(i + 1));
- }
+
node = node.Children.GetValueOrDefault(word[i]);
}
}
@@ -88,23 +76,17 @@ internal ISet GetEdits1(string word)
{
var firsts = FirstChars[word[0]];
foreach (var first in firsts)
- {
inserts.Add(first + word);
- }
}
var node = WordTrie.Root.Children.GetValueOrDefault(word[0]);
- for (int i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
+ for (var i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
{
foreach (var c in node.Children.Keys)
- {
- inserts.Add(word.Substring(0, i+1) + c + word.Substring(i+1));
- }
+ inserts.Add(word.Substring(0, i + 1) + c + word.Substring(i + 1));
if (i < word.Length - 1)
- {
node = node.Children.GetValueOrDefault(word[i + 1]);
- }
}
}
@@ -121,9 +103,8 @@ internal ISet GetKnownEdits2(string word)
{
var result = new HashSet();
foreach (var e1 in GetEdits1(word))
- {
result.UnionWith(GetEdits1(e1).Where(e => WordDictionary.Instance.ContainsWord(e)));
- }
+
return result;
}
@@ -135,16 +116,12 @@ internal ISet GetKnownWords(IEnumerable words)
public IEnumerable Suggests(string word)
{
if (WordDict.ContainsWord(word))
- {
return new[] {word};
- }
var candicates = GetKnownWords(GetEdits1(word));
if (candicates.IsNotEmpty())
- {
return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c));
- }
-
+
candicates.UnionWith(GetKnownEdits2(word));
return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c));
}
@@ -155,4 +132,4 @@ internal class WordSplit
public string Left { get; set; }
public string Right { get; set; }
}
-}
+}
\ No newline at end of file
diff --git a/Segmenter/Token.cs b/Segmenter/Token.cs
index 1083027..5e2ea67 100644
--- a/Segmenter/Token.cs
+++ b/Segmenter/Token.cs
@@ -15,7 +15,7 @@ public Token(string word, int startIndex, int endIndex)
public override string ToString()
{
- return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex);
+ return $"[{Word}, ({StartIndex}, {EndIndex})]";
}
}
}
\ No newline at end of file
diff --git a/Segmenter/WordDictionary.cs b/Segmenter/WordDictionary.cs
index 553e200..45b2466 100644
--- a/Segmenter/WordDictionary.cs
+++ b/Segmenter/WordDictionary.cs
@@ -15,7 +15,7 @@ public class WordDictionary
private static readonly Lazy lazy = new Lazy(() => new WordDictionary());
private static readonly string MainDict = ConfigManager.MainDictFile;
- internal IDictionary Trie = new Dictionary();
+ internal readonly IDictionary Trie = new Dictionary();
///
/// total occurrence of all words.
@@ -30,10 +30,7 @@ private WordDictionary()
Debug.WriteLine("total freq: {0}", Total);
}
- public static WordDictionary Instance
- {
- get { return lazy.Value; }
- }
+ public static WordDictionary Instance => lazy.Value;
private void LoadDict()
{
@@ -52,7 +49,7 @@ private void LoadDict()
var tokens = line.Split(' ');
if (tokens.Length < 2)
{
- Debug.Fail(string.Format("Invalid line: {0}", line));
+ Debug.Fail($"Invalid line: {line}");
continue;
}
@@ -78,7 +75,7 @@ private void LoadDict()
}
catch (IOException e)
{
- Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message));
+ Debug.Fail($"{MainDict} load failure, reason: {e.Message}");
}
catch (FormatException fe)
{
@@ -93,18 +90,13 @@ public bool ContainsWord(string word)
public int GetFreqOrDefault(string key)
{
- if (ContainsWord(key))
- return Trie[key];
- else
- return 1;
+ return ContainsWord(key) ? Trie[key] : 1;
}
public void AddWord(string word, int freq, string tag = null)
{
if (ContainsWord(word))
- {
Total -= Trie[word];
- }
Trie[word] = freq;
Total += freq;
@@ -125,13 +117,10 @@ public void DeleteWord(string word)
internal int SuggestFreq(string word, IEnumerable segments)
{
- double freq = 1;
- foreach (var seg in segments)
- {
- freq *= GetFreqOrDefault(seg) / Total;
- }
+ var freq = segments.Aggregate(1.0,
+ (current, seg) => current * (GetFreqOrDefault(seg) / Total)); // 1.0 seed: accumulate as double, like the removed loop
- return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word));
+ return Math.Max((int) (freq * Total) + 1, GetFreqOrDefault(word));
}
}
}
\ No newline at end of file
diff --git a/TestProject/TestProject.csproj b/TestProject/TestProject.csproj
new file mode 100644
index 0000000..6c6743f
--- /dev/null
+++ b/TestProject/TestProject.csproj
@@ -0,0 +1,19 @@
+
+
+
+ netcoreapp2.2
+
+ false
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/TestProject/UnitTest1.cs b/TestProject/UnitTest1.cs
new file mode 100644
index 0000000..3b8fe04
--- /dev/null
+++ b/TestProject/UnitTest1.cs
@@ -0,0 +1,171 @@
+using System;
+using System.Linq;
+using JiebaNet.Analyser;
+using JiebaNet.Segmenter;
+using JiebaNet.Segmenter.Common;
+using JiebaNet.Segmenter.PosSeg;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace TestProject
+{
+ public class UnitTest1 // Segmenter / POS tagger / keyword extractor tests; only Test1 asserts, the rest log their results.
+ {
+ private readonly ITestOutputHelper _testOutputHelper; // per-test output sink supplied by xUnit
+
+ public UnitTest1(ITestOutputHelper testOutputHelper)
+ {
+ _testOutputHelper = testOutputHelper;
+ }
+
+ [Fact] // Loads userdict.txt and asserts the exact joined segments for each cut mode.
+ public void Test1()
+ {
+ var segmenter = new JiebaSegmenter();
+ segmenter.LoadUserDict("userdict.txt");
+ var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
+ Assert.Equal("【全模式】:我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学", $"【全模式】:{string.Join("/ ", segments)}");
+ segments = segmenter.Cut("我来到北京清华大学"); // precise mode is the default
+ Assert.Equal("【精确模式】:我/ 来到/ 北京/ 清华大学", $"【精确模式】:{string.Join("/ ", segments)}");
+
+ segments = segmenter.Cut("他来到了网易杭研大厦"); // precise mode by default, with the HMM model also in use
+ Assert.Equal("【新词识别】:他/ 来到/ 了/ 网易/ 杭研/ 大厦", $"【新词识别】:{string.Join("/ ", segments)}");
+
+ segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // search-engine mode
+ Assert.Equal("【搜索引擎模式】:小明/ 硕士/ 毕业/ 于/ 中国/ 科学/ 学院/ 科学院/ 中国科学院/ 计算/ 计算所/ ,/ 后/ 在/ 日本/ 京都/ 大学/ 日本京都大学/ 深造",
+ $"【搜索引擎模式】:{string.Join("/ ", segments)}");
+
+ segments = segmenter.Cut("结过婚的和尚未结过婚的");
+ Assert.Equal("【歧义消除】:结过婚/ 的/ 和/ 尚未/ 结过婚/ 的", $"【歧义消除】:{string.Join("/ ", segments)}");
+
+ segments = segmenter.Cut("linezerodemo机器学习学习机器");
+ Assert.Equal("【用户字典】:linezero/ demo/ 机器学习/ 学习/ 机器", $"【用户字典】:{string.Join("/ ", segments)}");
+ }
+
+ [Fact] // Logs the five most common segments of a sample text with their counts; no assertions.
+ public void Test2()
+ {
+ var segmenter = new JiebaSegmenter();
+ // word-frequency statistics
+ var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
+ var freqs = new Counter(segmenter.Cut(s));
+ foreach (var (key, value) in freqs.MostCommon(5))
+ {
+ _testOutputHelper.WriteLine($"{key}: {value}");
+ }
+ }
+
+ [Fact] // Smoke demo: logs the segmentation of several sentences in each cut mode; no assertions.
+ public void CutDemo()
+ {
+ var segmenter = new JiebaSegmenter();
+ var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
+ _testOutputHelper.WriteLine("【全模式】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("我来到北京清华大学"); // precise mode is the default
+ _testOutputHelper.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("他来到了网易杭研大厦"); // precise mode by default, with the HMM model also in use
+ _testOutputHelper.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // search-engine mode
+ _testOutputHelper.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("结过婚的和尚未结过婚的");
+ _testOutputHelper.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("北京大学生喝进口红酒");
+ _testOutputHelper.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("在北京大学生活区喝进口红酒");
+ _testOutputHelper.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));
+
+ segments = segmenter.Cut("腾讯视频致力于打造中国最大的在线视频媒体平台,以丰富的内容、极致的观看体验");
+ _testOutputHelper.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
+
+ segmenter.DeleteWord("湖南");
+ segmenter.AddWord("湖南");
+ //segmenter.AddWord("长沙市");
+ segments = segmenter.Cut("湖南长沙市天心区");
+ _testOutputHelper.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
+ }
+
+ [Fact] // Smoke demo: logs every token (word, start, end) from the default Tokenize call; no assertions.
+ public void TokenizeDemo()
+ {
+ var segmenter = new JiebaSegmenter();
+ var s = "永和服装饰品有限公司";
+ var tokens = segmenter.Tokenize(s);
+ foreach (var token in tokens)
+ {
+ _testOutputHelper.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex,
+ token.EndIndex);
+ }
+ }
+
+ [Fact] // Smoke demo: same as TokenizeDemo but with TokenizerMode.Search; no assertions.
+ public void TokenizeSearchDemo()
+ {
+ var segmenter = new JiebaSegmenter();
+ var s = "永和服装饰品有限公司";
+ var tokens = segmenter.Tokenize(s, TokenizerMode.Search);
+ foreach (var token in tokens)
+ {
+ _testOutputHelper.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex,
+ token.EndIndex);
+ }
+ }
+
+ [Fact] // Smoke demo: logs the "word/flag" pairs returned by PosSegmenter.Cut; no assertions.
+ public void PosCutDemo()
+ {
+ var posSeg = new PosSegmenter();
+ var s = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";
+
+ var tokens = posSeg.Cut(s);
+ _testOutputHelper.WriteLine(
+ string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag))));
+ }
+
+ [Fact] // Smoke demo: logs the keywords ExtractTags returns for a short paragraph; no assertions.
+ public void ExtractTagsDemo()
+ {
+ var text =
+ "程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。";
+ var extractor = new TfidfExtractor(); // NOTE(review): TestDemo.cs in this change passes two constructor arguments - confirm a parameterless overload still exists
+ var keywords = extractor.ExtractTags(text);
+ foreach (var keyword in keywords)
+ {
+ _testOutputHelper.WriteLine(keyword);
+ }
+ }
+
+ [Fact] // Smoke demo: logs up to 10 keywords filtered by Constants.NounAndVerbPos; no assertions.
+ public void ExtractTagsDemo2()
+ {
+ var text =
+ @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。
+ 算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。
+ 形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。";
+
+ var extractor = new TfidfExtractor(); // NOTE(review): TestDemo.cs in this change passes two constructor arguments - confirm a parameterless overload still exists
+ var keywords = extractor.ExtractTags(text, 10, Constants.NounAndVerbPos);
+ foreach (var keyword in keywords)
+ {
+ _testOutputHelper.WriteLine(keyword);
+ }
+ }
+
+ [Fact] // Smoke demo: logs every segment of a sample text with its count via MostCommon(); no assertions.
+ public void TestWordFreq()
+ {
+ var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
+ var seg = new JiebaSegmenter();
+ var freqs = new Counter(seg.Cut(s));
+ foreach (var pair in freqs.MostCommon())
+ {
+ _testOutputHelper.WriteLine($"{pair.Key}: {pair.Value}");
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/jieba.NET/Program.cs b/jieba.NET/Program.cs
index 3b6c0a5..4d6562a 100644
--- a/jieba.NET/Program.cs
+++ b/jieba.NET/Program.cs
@@ -32,9 +32,9 @@ static void Main(string[] args)
//词频统计
var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
var freqs = new Counter(segmenter.Cut(s));
- foreach (var pair in freqs.MostCommon(5))
+ foreach (var (key, value) in freqs.MostCommon(5))
{
- Console.WriteLine($"{pair.Key}: {pair.Value}");
+ Console.WriteLine($"{key}: {value}");
}
//new TestDemo().CutDemo();
diff --git a/jieba.NET/TestDemo.cs b/jieba.NET/TestDemo.cs
index 7fd49a7..b2abc20 100644
--- a/jieba.NET/TestDemo.cs
+++ b/jieba.NET/TestDemo.cs
@@ -1,4 +1,5 @@
using System;
+using System.Collections.Generic;
using System.Linq;
using JiebaNet.Analyser;
using JiebaNet.Segmenter.PosSeg;
@@ -16,10 +17,10 @@ public void CutDemo()
var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));
- segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式
+ segments = segmenter.Cut("我来到北京清华大学"); // 默认为精确模式
Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));
- segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型
+ segments = segmenter.Cut("他来到了网易杭研大厦"); // 默认为精确模式,同时也使用HMM模型
Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));
segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // 搜索引擎模式
@@ -58,7 +59,8 @@ public void TokenizeDemo()
var tokens = segmenter.Tokenize(s);
foreach (var token in tokens)
{
- Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, token.EndIndex);
+ Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex,
+ token.EndIndex);
}
}
@@ -70,7 +72,8 @@ public void TokenizeSearchDemo()
var tokens = segmenter.Tokenize(s, TokenizerMode.Search);
foreach (var token in tokens)
{
- Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex, token.EndIndex);
+ Console.WriteLine("word {0,-12} start: {1,-3} end: {2,-3}", token.Word, token.StartIndex,
+ token.EndIndex);
}
}
@@ -81,7 +84,8 @@ public void PosCutDemo()
var s = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";
var tokens = posSeg.Cut(s);
- Console.WriteLine(string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag))));
+ Console.WriteLine(
+ string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag))));
}
@@ -89,7 +93,7 @@ public void ExtractTagsDemo()
{
var text =
"程序员(英文Programmer)是从事程序开发、维护的专业人员。一般将程序员分为程序设计人员和程序编码人员,但两者的界限并不非常清楚,特别是在中国。软件从业人员分为初级程序员、高级程序员、系统分析员和项目经理四大类。";
- var extractor = new TfidfExtractor();
+ var extractor = new TfidfExtractor(new HashSet(), new Dictionary());
var keywords = extractor.ExtractTags(text);
foreach (var keyword in keywords)
{
@@ -100,11 +104,12 @@ public void ExtractTagsDemo()
public void ExtractTagsDemo2()
{
- var text = @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。
+ var text =
+ @"在数学和计算机科学/算学之中,算法/算则法(Algorithm)为一个计算的具体步骤,常用于计算、数据处理和自动推理。精确而言,算法是一个表示为有限长列表的有效方法。算法应包含清晰定义的指令用于计算函数。
算法中的指令描述的是一个计算,当其运行时能从一个初始状态和初始输入(可能为空)开始,经过一系列有限而清晰定义的状态最终产生输出并停止于一个终态。一个状态到另一个状态的转移不一定是确定的。随机化算法在内的一些算法,包含了一些随机输入。
形式化算法的概念部分源自尝试解决希尔伯特提出的判定问题,并在其后尝试定义有效计算性或者有效方法中成形。这些尝试包括库尔特·哥德尔、雅克·埃尔布朗和斯蒂芬·科尔·克莱尼分别于1930年、1934年和1935年提出的递归函数,阿隆佐·邱奇于1936年提出的λ演算,1936年Emil Leon Post的Formulation 1和艾伦·图灵1937年提出的图灵机。即使在当前,依然常有直觉想法难以定义为形式化算法的情况。";
- var extractor = new TfidfExtractor();
+ var extractor = new TfidfExtractor(new HashSet(), new Dictionary());
var keywords = extractor.ExtractTags(text, 10, Constants.NounAndVerbPos);
foreach (var keyword in keywords)
{
@@ -117,9 +122,9 @@ public void TestWordFreq()
var s = "此领域探讨如何处理及运用自然语言。自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。";
var seg = new JiebaSegmenter();
var freqs = new Counter(seg.Cut(s));
- foreach (var pair in freqs.MostCommon())
+ foreach (var (key, value) in freqs.MostCommon())
{
- Console.WriteLine($"{pair.Key}: {pair.Value}");
+ Console.WriteLine($"{key}: {value}");
}
}
}