Skip to content

Commit 8cdb794

Browse files
committed
Prepare structure for the RTF parser
1 parent a8030de commit 8cdb794

File tree

10 files changed

+158
-7
lines changed

10 files changed

+158
-7
lines changed

src/DocSharp.Docx/DocxToRtf/DocxToRtfConverter.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ public partial class DocxToRtfConverter : DocxToStringWriterBase<RtfStringWriter
2828
/// and escape other chars as '\xx (using code pages for chars 128-255)
2929
/// or as \uc1\u{X} (using Unicode).
3030
/// If an old reader does not support Unicode it will skip these characters rather than corrupting the document.
31+
/// This property only affects how the RTF file itself is written,
32+
/// and does not change how characters are encoded in RTF text tokens.
3133
/// </summary>
3234
public override Encoding DefaultEncoding => Encoding.ASCII;
3335

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
namespace DocSharp.Rtf;
2+
3+
/// <summary>
/// Token that carries only a control word name.
/// </summary>
internal class RtfControlWord : RtfToken
{
    /// <summary>
    /// Creates the token and stores the supplied name.
    /// </summary>
    /// <param name="name">Initial value of <see cref="Name"/>.</param>
    public RtfControlWord(string name)
    {
        Name = name;
    }

    /// <summary>
    /// Name of the control word; can be reassigned after construction.
    /// </summary>
    public string Name { get; set; }
}
7+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
namespace DocSharp.Rtf;
2+
3+
/// <summary>
/// Token that pairs a control word name with a value of type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type of the value attached to the control word.</typeparam>
internal class RtfControlWordWithValue<T> : RtfToken
{
    /// <summary>
    /// Creates the token and stores the supplied name and value.
    /// </summary>
    /// <param name="name">Initial value of <see cref="Name"/>.</param>
    /// <param name="value">Initial value of <see cref="Value"/>.</param>
    public RtfControlWordWithValue(string name, T value)
    {
        Name = name;
        Value = value;
    }

    /// <summary>
    /// Name of the control word.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// Value associated with the control word.
    /// </summary>
    public T Value { get; set; }
}
10+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace DocSharp.Rtf;
2+
3+
/// <summary>
/// Specialization of <see cref="RtfGroup"/> that currently adds no members of its own.
/// </summary>
internal class RtfDestination : RtfGroup
{
    /// <summary>
    /// Creates the destination, forwarding the name to the base group.
    /// </summary>
    /// <param name="name">Name passed to the <see cref="RtfGroup"/> constructor.</param>
    public RtfDestination(string name) : base(name)
    {
    }
}
6+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
using System.Collections.Generic;
2+
3+
namespace DocSharp.Rtf;
4+
5+
/// <summary>
/// Container for the tokens produced when reading an RTF input.
/// </summary>
internal class RtfDocument
{
    /// <summary>
    /// Tokens of the document; starts out as an empty sequence.
    /// </summary>
    internal IEnumerable<RtfToken> Tokens;

    /// <summary>
    /// Creates a document with no tokens.
    /// </summary>
    public RtfDocument()
    {
        Tokens = [];
    }
}
9+
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
using System.Collections.Generic;
2+
3+
namespace DocSharp.Rtf;
4+
5+
/// <summary>
/// Named token that owns a sequence of child tokens.
/// </summary>
internal class RtfGroup : RtfControlWord
{
    /// <summary>
    /// Child tokens of the group; starts out as an empty sequence.
    /// </summary>
    internal IEnumerable<RtfToken> Tokens = [];

    /// <summary>
    /// Creates an empty group with the given name.
    /// </summary>
    /// <param name="name">Name passed to the <see cref="RtfControlWord"/> constructor.</param>
    public RtfGroup(string name) : base(name)
    {
    }
}
14+
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using System.IO;
2+
using System.Text;
3+
4+
namespace DocSharp.Rtf;
5+
6+
/// <summary>
/// Reads RTF input into an <see cref="RtfDocument"/>.
/// The tokenizer is still a skeleton: characters are consumed and classified,
/// but no tokens are produced yet, so the returned document is always empty.
/// </summary>
internal static class RtfReader
{
#if !NETFRAMEWORK
    static RtfReader()
    {
        // Register the code pages provider once per process
        // (presumably needed later to decode \'xx escapes — confirm when implemented).
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }
#endif

    /// <summary>
    /// Consumes <paramref name="reader"/> character by character until the end of the input.
    /// </summary>
    /// <param name="reader">Source of the RTF characters; read to the end but not disposed.</param>
    /// <returns>A new <see cref="RtfDocument"/>; no tokens are added to it yet.</returns>
    public static RtfDocument ReadRtf(TextReader reader)
    {
        var rtf = new RtfDocument();

        int i;
        char currentChar = default;
        // Tracked for the upcoming parser logic; not consumed anywhere yet.
        char previousChar = default;
        while ((i = reader.Read()) != -1)
        {
            previousChar = currentChar;
            currentChar = (char)i;

            switch (currentChar)
            {
                case '{':
                    // TODO: not handled yet.
                    break;
                case '}':
                    // TODO: not handled yet.
                    break;
                case '\\':
                    // TODO: not handled yet.
                    break;
                case '*':
                    // TODO: not handled yet.
                    break;
                default:
                    // Classification is restricted to ASCII on purpose: char.IsDigit would also
                    // accept non-ASCII Unicode decimal digits, which is inconsistent with
                    // IsEnglishLetter below.
                    if (IsAsciiDigit(currentChar))
                    {
                        // TODO: not handled yet.
                    }
                    else if (IsEnglishLetter(currentChar))
                    {
                        // TODO: not handled yet.
                    }
                    break;
            }
        }

        return rtf;
    }

    /// <summary>
    /// Returns true only for the ASCII digits '0' through '9'.
    /// </summary>
    private static bool IsAsciiDigit(char c)
    {
        return c >= '0' && c <= '9';
    }

    /// <summary>
    /// Returns true only for the ASCII letters 'A'-'Z' and 'a'-'z'.
    /// </summary>
    private static bool IsEnglishLetter(char c)
    {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace DocSharp.Rtf;
2+
3+
/// <summary>
/// Token that holds a run of text.
/// </summary>
internal class RtfText : RtfToken
{
    /// <summary>
    /// Creates a text token whose <see cref="Text"/> is the empty string.
    /// </summary>
    public RtfText() : this(string.Empty)
    {
    }

    /// <summary>
    /// Creates a text token with the supplied content.
    /// </summary>
    /// <param name="text">Initial value of <see cref="Text"/>.</param>
    public RtfText(string text)
    {
        Text = text;
    }

    /// <summary>
    /// Text content of the token; can be reassigned after construction.
    /// </summary>
    public string Text { get; set; }
}
9+

src/DocSharp.Docx/RtfToDocx/RtfToDocxConverter.cs

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,39 @@
1212
using System.Xml;
1313
using System.Diagnostics;
1414
using DocumentFormat.OpenXml;
15+
using DocSharp.Rtf;
1516

1617
namespace DocSharp.Docx;
1718

18-
internal class RtfToDocxConverter : ITextToDocxConverter
19+
public class RtfToDocxConverter : ITextToDocxConverter
1920
{
21+
/// <summary>
22+
/// RTF files typically use ASCII (chars 0-127) and escape other chars using
23+
/// code pages (e.g. \'e0 for "à") or Unicode (e.g. \u915 for Γ).
24+
/// Code pages specify chars 128-255, depend on the system region and are also called "ANSI".
25+
/// Unicode can encode many more characters and is often called "non-ANSI" in the RTF specification.
26+
/// The code page is specified by the \ansi (default), \mac, \pc or \pca control in the RTF header,
27+
/// optionally followed by \ansicpgN. For example \ansicpg1252 indicates Windows-1252 and is used by U.S. Windows
28+
/// (extends ASCII with other letters and symbols related to the English alphabet).
29+
/// If the code page is not specified, ANSI based on the system region is assumed.
30+
/// Despite this, it's possible that some old RTF files use non-ASCII-based code pages,
31+
/// or that some RTF writers directly write non-ASCII letters such as à into text tokens,
32+
/// although it's not standard.
33+
/// Therefore, the DefaultEncoding property exists; it can be set by passing a different inputEncoding parameter
34+
/// to the Convert methods; alternatively a TextReader initialized with the correct encoding can be directly passed.
35+
/// Libraries such as https://github.com/CharsetDetector/UTF-unknown can be used to detect uncommon encodings;
36+
/// they require the stream to be seekable, so DocSharp is not using this approach by default.
37+
/// Note: the DefaultEncoding property only affects how the raw RTF file is read
38+
/// (in particular the RTF header and control words), it does not change how text tokens are handled:
39+
/// special characters such as \'xx are still interpreted based on the code page detected by RtfReader.
40+
/// </summary>
41+
public Encoding DefaultEncoding => Encoding.ASCII;
42+
2043
/// <summary>
2144
/// Populate the target DOCX document with converted RTF content.
2245
/// </summary>
2346
/// <param name="input"></param>
2447
/// <param name="targetDocument"></param>
25-
/// <exception cref="NotImplementedException"></exception>
2648
public void BuildDocx(TextReader input, WordprocessingDocument targetDocument)
2749
{
2850
if (targetDocument.MainDocumentPart == null)
@@ -31,9 +53,18 @@ public void BuildDocx(TextReader input, WordprocessingDocument targetDocument)
3153
if (targetDocument.MainDocumentPart!.Document == null)
3254
targetDocument.MainDocumentPart.Document = new Document();
3355

34-
// TODO
35-
// var rtfDocument = ParseRtf(input);
36-
// InsertRtf(rtfDocument, targetDocument);
37-
throw new NotImplementedException();
56+
var rtfDocument = RtfReader.ReadRtf(input);
57+
foreach (var token in rtfDocument.Tokens)
58+
{
59+
// TODO
60+
}
61+
}
62+
63+
64+
#if !NETFRAMEWORK
65+
static RtfToDocxConverter()
66+
{
67+
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
3868
}
39-
}
69+
#endif
70+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
namespace DocSharp.Rtf;
2+
3+
/// <summary>
/// Common base type of the RTF token classes; declares no members of its own.
/// </summary>
internal abstract class RtfToken { }
6+

0 commit comments

Comments
 (0)