Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions Analyser/Analyser.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
</PropertyGroup>

<ItemGroup>
<None Remove="Resources\idf.txt" />
<None Remove="Resources\stopwords.txt" />
</ItemGroup>

<ItemGroup>
<EmbeddedResource Include="Resources\idf.txt">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Resources\stopwords.txt">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Segmenter\Segmenter.csproj" />
</ItemGroup>
Expand Down
19 changes: 2 additions & 17 deletions Analyser/ConfigManager.cs
Original file line number Diff line number Diff line change
@@ -1,27 +1,12 @@
using System.IO;
using System;

namespace JiebaNet.Analyser
{
public class ConfigManager
{
// TODO: remove duplicated code.
public static string ConfigFileBaseDir
{
get
{
return "Resources";
}
}
public static string ConfigFileBaseDir => "Resources";

public static string IdfFile
{
get { return Path.Combine(ConfigFileBaseDir, "idf.txt"); }
}

public static string StopWordsFile
{
get { return Path.Combine(ConfigFileBaseDir, "stopwords.txt"); }
}
public static string IdfFile => Path.Combine(ConfigFileBaseDir, "idf.txt");
}
}
34 changes: 6 additions & 28 deletions Analyser/IdfLoader.cs
Original file line number Diff line number Diff line change
@@ -1,47 +1,25 @@
using JiebaNet.Segmenter.Common;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;

namespace JiebaNet.Analyser
{
public class IdfLoader
{
internal string IdfFilePath { get; set; }
internal IDictionary<string, double> IdfFreq { get; set; }
internal double MedianIdf { get; set; }

public IdfLoader(string idfPath = null)
public IdfLoader(IDictionary<string, double> idfFreq)
{
IdfFilePath = string.Empty;
IdfFreq = new Dictionary<string, double>();
MedianIdf = 0.0;
if (!string.IsNullOrWhiteSpace(idfPath))
{
SetNewPath(idfPath);
}
IdfFreq = idfFreq ?? new Dictionary<string, double>();
MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}

public void SetNewPath(string newIdfPath)
public void SetNewPath(IDictionary<string, double> newIdfFreq)
{
var idfPath = newIdfPath;
if (IdfFilePath != idfPath)
{
IdfFilePath = idfPath;
var lines = FileExtension.ReadEmbeddedAllLines(idfPath, Encoding.UTF8);
IdfFreq = new Dictionary<string, double>();
foreach (var line in lines)
{
var parts = line.Trim().Split(' ');
var word = parts[0];
var freq = double.Parse(parts[1]);
IdfFreq[word] = freq;
}

MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}
IdfFreq = IdfFreq?.Union(newIdfFreq).ToDictionary(k => k.Key, v => v.Value) ?? newIdfFreq;
MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}
}
}
28 changes: 9 additions & 19 deletions Analyser/KeywordExtractor.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
using JiebaNet.Segmenter.Common;
using Microsoft.Extensions.FileProviders;
using System.Collections.Generic;
using System.IO;
using System.Collections.Generic;

namespace JiebaNet.Analyser
{
Expand All @@ -14,35 +11,28 @@ public abstract class KeywordExtractor
"this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
};

protected virtual ISet<string> StopWords { get; set; }

public void SetStopWords(string stopWordsFile)
protected ISet<string> StopWords { get; set; }
public void SetStopWords(ISet<string> stopWords)
{
StopWords = new HashSet<string>();
var lines = FileExtension.ReadEmbeddedAllLines(stopWordsFile);
foreach (var line in lines)
{
StopWords.Add(line.Trim());
}
StopWords = stopWords ?? new HashSet<string>();
}

/// <summary>
/// Adds a single word to the stop-word set, stored in trimmed form.
/// </summary>
/// <param name="word">Word to register; null or whitespace-only values are ignored.</param>
public void AddStopWord(string word)
{
    // Guard first: calling Trim() on null would throw, and a blank entry carries no word.
    if (string.IsNullOrWhiteSpace(word))
    {
        return;
    }

    // Add the trimmed form directly. A set's Add is a no-op when the item is already
    // present, so a separate Contains check (which the untrimmed input could miss or
    // wrongly satisfy) is not needed.
    StopWords.Add(word.Trim());
}

/// <summary>
/// Adds every word in <paramref name="words"/> to the stop-word set by calling
/// <see cref="AddStopWord"/> for each element.
/// </summary>
/// <param name="words">Words to register; a null sequence is treated as empty.</param>
public void AddStopWords(IEnumerable<string> words)
{
    // Treat null like an empty sequence instead of failing inside the foreach.
    if (words == null)
    {
        return;
    }

    foreach (var word in words)
    {
        AddStopWord(word);
    }
}

public abstract IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null);
public abstract IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null);
public abstract IEnumerable<string> ExtractTags(string text, int count = 20,
IEnumerable<string> allowPos = null);

public abstract IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20,
IEnumerable<string> allowPos = null);
}
}
60 changes: 29 additions & 31 deletions Analyser/TextRankExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,39 @@ public bool PairFilter(IEnumerable<string> allowPos, Pair wp)
&& !StopWords.Contains(wp.Word.ToLower());
}

public TextRankExtractor()
public TextRankExtractor(ISet<string> stopWords)
{
Span = 5;

Segmenter = new JiebaSegmenter();
PosSegmenter = new PosSegmenter(Segmenter);
SetStopWords(ConfigManager.StopWordsFile);
SetStopWords(stopWords);
if (StopWords.IsEmpty())
{
StopWords.UnionWith(DefaultStopWords);
}
}

public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
public override IEnumerable<string> ExtractTags(string text, int count = 20,
IEnumerable<string> allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
if (count <= 0) { count = 20; }
if (count <= 0)
{
count = 20;
}

return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}

public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20,
IEnumerable<string> allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
if (count <= 0) { count = 20; }
return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
if (count <= 0)
{
count = 20;
}

return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair
{
Word = p.Key, Weight = p.Value
}).Take(count);
Expand All @@ -60,9 +68,7 @@ public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, i
private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
{
if (allowPos.IsEmpty())
{
allowPos = DefaultPosFilter;
}

var g = new UndirectWeightedGraph();
var cm = new Dictionary<string, int>();
Expand All @@ -71,27 +77,19 @@ private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<stri
for (var i = 0; i < words.Count(); i++)
{
var wp = words[i];
if (PairFilter(allowPos, wp))
if (!PairFilter(allowPos, wp)) continue;
for (var j = i + 1; j < i + Span; j++)
{
for (var j = i + 1; j < i + Span; j++)
{
if (j >= words.Count)
{
break;
}
if (!PairFilter(allowPos, words[j]))
{
continue;
}

// TODO: better separator.
var key = wp.Word + "$" + words[j].Word;
if (!cm.ContainsKey(key))
{
cm[key] = 0;
}
cm[key] += 1;
}
if (j >= words.Count)
break;
if (!PairFilter(allowPos, words[j]))
continue;

// TODO: better separator.
var key = wp.Word + "$" + words[j].Word;
if (!cm.ContainsKey(key))
cm[key] = 0;
cm[key] += 1;
}
}

Expand Down
55 changes: 20 additions & 35 deletions Analyser/TfidfExtractor.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter;
using JiebaNet.Segmenter.Common;
Expand All @@ -10,7 +9,7 @@ namespace JiebaNet.Analyser
public class TfidfExtractor : KeywordExtractor
{
private static readonly string DefaultIdfFile = ConfigManager.IdfFile;
private static readonly int DefaultWordCount = 20;
private const int DefaultWordCount = 20;

private JiebaSegmenter Segmenter { get; set; }
private PosSegmenter PosSegmenter { get; set; }
Expand All @@ -19,32 +18,24 @@ public class TfidfExtractor : KeywordExtractor
private IDictionary<string, double> IdfFreq { get; set; }
private double MedianIdf { get; set; }

public TfidfExtractor(JiebaSegmenter segmenter = null)
public TfidfExtractor(ISet<string> stopWords, IDictionary<string, double> idfFreq,
JiebaSegmenter segmenter = null)
{
if (segmenter.IsNull())
{
Segmenter = new JiebaSegmenter();
}
else
{
Segmenter = segmenter;
}
Segmenter = segmenter.IsNull() ? new JiebaSegmenter() : segmenter;
PosSegmenter = new PosSegmenter(Segmenter);
SetStopWords(ConfigManager.StopWordsFile);
SetStopWords(stopWords);
if (StopWords.IsEmpty())
{
StopWords.UnionWith(DefaultStopWords);
}

Loader = new IdfLoader(DefaultIdfFile);
Loader = new IdfLoader(idfFreq);

IdfFreq = Loader.IdfFreq;
MedianIdf = Loader.MedianIdf;
}

public void SetIdfPath(string idfPath)
public void SetIdfPath(IDictionary<string, double> idfFreq)
{
Loader.SetNewPath(idfPath);
Loader.SetNewPath(idfFreq);
IdfFreq = Loader.IdfFreq;
MedianIdf = Loader.MedianIdf;
}
Expand All @@ -58,46 +49,40 @@ private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allo
private IDictionary<string, double> GetWordIfidf(string text, IEnumerable<string> allowPos)
{
IEnumerable<string> words = null;
if (allowPos.IsNotEmpty())
{
words = FilterCutByPos(text, allowPos);
}
else
{
words = Segmenter.Cut(text);
}
words = allowPos.IsNotEmpty() ? FilterCutByPos(text, allowPos) : Segmenter.Cut(text);

// Calculate TF
var freq = new Dictionary<string, double>();
foreach (var word in words)
{
var w = word;
if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower()))
{
continue;
}
freq[w] = freq.GetDefault(w, 0.0) + 1.0;
}

var total = freq.Values.Sum();
foreach (var k in freq.Keys.ToList())
{
freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total;
}

return freq;
}

public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
public override IEnumerable<string> ExtractTags(string text, int count = 20,
IEnumerable<string> allowPos = null)
{
if (count <= 0) { count = DefaultWordCount; }
if (count <= 0)
count = DefaultWordCount;

var freq = GetWordIfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}

public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20,
IEnumerable<string> allowPos = null)
{
if (count <= 0) { count = DefaultWordCount; }
if (count <= 0)
count = DefaultWordCount;

var freq = GetWordIfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
Expand All @@ -112,4 +97,4 @@ public class WordWeightPair
public string Word { get; set; }
public double Weight { get; set; }
}
}
}
Loading