1212using System . Xml ;
1313using System . Diagnostics ;
1414using DocumentFormat . OpenXml ;
15+ using DocSharp . Rtf ;
1516
1617namespace DocSharp . Docx ;
1718
18- internal class RtfToDocxConverter : ITextToDocxConverter
19+ public class RtfToDocxConverter : ITextToDocxConverter
1920{
21+ /// <summary>
22+ /// RTF files typically use ASCII (chars 0-127) and escape other chars using
23+ /// code pages (e.g. \'e0 for "à") or Unicode (e.g. \u915 for Γ).
24+ /// Code pages specify chars 128-255, depend on the system region and are also called "ANSI".
25+ /// Unicode can encode many more characters and is often called "non-ANSI" in the RTF specification.
26+ /// The code page is specified by the \ansi (default), \mac, \pc or \pca control in the RTF header,
27+ /// optionally followed by \ansicpgN. For example, \ansicpg1252 indicates Windows-1252 and is used by U.S. Windows
28+ /// (it extends ASCII with other letters and symbols related to the English alphabet).
29+ /// If the code page is not specified, ANSI based on the system region is assumed.
30+ /// Despite this, it's possible that some old RTF files use non-ASCII-based code pages,
31+ /// or that some RTF writers directly write non-ASCII letters such as à into text tokens,
32+ /// although it's not standard.
33+ /// Therefore, the DefaultEncoding property exists; it can be set by passing a different inputEncoding parameter
34+ /// to the Convert methods; alternatively a TextReader initialized with the correct encoding can be directly passed.
35+ /// Libraries such as https://github.com/CharsetDetector/UTF-unknown can be used to detect uncommon encodings;
36+ /// they require the stream to be seekable, so DocSharp is not using this approach by default.
37+ /// Note: the DefaultEncoding property only affects how the raw RTF file is read
38+ /// (in particular the RTF header and control words), it does not change how text tokens are handled:
39+ /// special characters such as \'xx are still interpreted based on the code page detected by RtfReader.
40+ /// </summary>
41+ public Encoding DefaultEncoding => Encoding . ASCII ;
42+
2043 /// <summary>
2144 /// Populate the target DOCX document with converted RTF content.
2245 /// </summary>
2346 /// <param name="input"></param>
2447 /// <param name="targetDocument"></param>
25- /// <exception cref="NotImplementedException"></exception>
2648 public void BuildDocx ( TextReader input , WordprocessingDocument targetDocument )
2749 {
2850 if ( targetDocument . MainDocumentPart == null )
@@ -31,9 +53,18 @@ public void BuildDocx(TextReader input, WordprocessingDocument targetDocument)
3153 if ( targetDocument . MainDocumentPart ! . Document == null )
3254 targetDocument . MainDocumentPart . Document = new Document ( ) ;
3355
34- // TODO
35- // var rtfDocument = ParseRtf(input);
36- // InsertRtf(rtfDocument, targetDocument);
37- throw new NotImplementedException ( ) ;
56+ var rtfDocument = RtfReader . ReadRtf ( input ) ;
57+ foreach ( var token in rtfDocument . Tokens )
58+ {
59+ // TODO
60+ }
61+ }
62+
63+
64+ #if ! NETFRAMEWORK
65+ static RtfToDocxConverter ( )
66+ {
67+ Encoding . RegisterProvider ( CodePagesEncodingProvider . Instance ) ;
3868 }
39- }
69+ #endif
70+ }
0 commit comments