1 /**
2  * Parse .mo files and find translated messages.
3  * Authors:
4  *  $(LINK2 https://github.com/FreeSlave, Roman Chistokhodov)
5  * Copyright:
6  *  Roman Chistokhodov, 2018
7  * License:
8  *  $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
9  * See_Also:
10  *  $(LINK2 https://www.gnu.org/software/gettext/manual/html_node/MO-Files.html, The Format of GNU MO Files)
11  */
12 
13 module mofile;
/**
 * Exception thrown when a plural form expression can't be parsed or evaluated.
 */
class PluralFormException : Exception
{
    /// Forwards all arguments to the $(D Exception) constructor.
    this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable nextInChain = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
21 
/**
 * Exception thrown when .mo data is in invalid or unsupported format.
 */
class MoFileException : Exception
{
    /// Forwards all arguments to the $(D Exception) constructor.
    this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable nextInChain = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
29 
30 private @safe
31 {
32     import std.conv : parse;
33     import std.ascii;
34     enum : ushort {
35         SHL = ubyte.max + 1,
36         SHR,
37         AND,
38         OR,
39         LTE,
40         GTE,
41         EQ,
42         NEQ,
43         NUM,
44     }
45 
46     class Plural
47     {
48     pure:
49         abstract int opCall(int n = 0) const;
50         abstract Plural clone();
51     }
52 
53     class Unary : Plural
54     {
55     pure:
56         this(Plural op) {
57             op1 = op;
58         }
59     protected:
60         Plural op1;
61     }
62 
63     class Binary : Plural
64     {
65     pure:
66         this(Plural first, Plural second) {
67             op1 = first;
68             op2 = second;
69         }
70     protected:
71         Plural op1, op2;
72     }
73 
74     final class Number : Plural
75     {
76     pure:
77         this(int number) {
78             num = number;
79         }
80         override Plural clone() {
81             return new Number(num);
82         }
83         override int opCall(int) const {
84             return num;
85         }
86     private:
87         int num;
88     }
89 
90 
91     final class UnaryOp(string op) : Unary
92     {
93     pure:
94         this(Plural op1) {
95             super(op1);
96         }
97         override int opCall(int n) const {
98             return mixin(op ~ " op1(n)");
99         }
100         override Plural clone() {
101             return new UnaryOp!(op)(op1.clone());
102         }
103     }
104 
105     final class BinaryOp(string op) : Binary
106     {
107     pure:
108         this(Plural first, Plural second) {
109             super(first, second);
110         }
111         override int opCall(int n) const {
112             return mixin("op1(n)" ~ op ~ "op2(n)");
113         }
114         override Plural clone() {
115             return new BinaryOp!(op)(op1.clone(), op2.clone());
116         }
117     }
118 
119     final class BinaryOpD(string op) : Binary
120     {
121     pure:
122         this(Plural first, Plural second) {
123             super(first, second);
124         }
125         override int opCall(int n) const {
126             int v2 = op2(n);
127             if (v2 == 0) {
128                 throw new PluralFormException("Division by zero during plural form computation");
129             }
130             return mixin("op1(n)" ~ op ~ "v2");
131         }
132         override Plural clone() {
133             return new BinaryOp!(op)(op1.clone(), op2.clone());
134         }
135     }
136 
137     alias UnaryOp!"!" Not;
138     alias UnaryOp!"-" Minus;
139     alias UnaryOp!"~" Invert;
140 
141     alias BinaryOp!"*" Mul;
142     alias BinaryOpD!"/" Div;
143     alias BinaryOpD!"%" Mod;
144 
145     alias BinaryOp!"+" Add;
146     alias BinaryOp!"-" Sub;
147 
148     alias BinaryOp!"<<" Shl;
149     alias BinaryOp!">>" Shr;
150 
151     alias BinaryOp!">" Gt;
152     alias BinaryOp!"<" Lt;
153     alias BinaryOp!">=" Gte;
154     alias BinaryOp!"<=" Lte;
155 
156     alias BinaryOp!"==" Eq;
157     alias BinaryOp!"!=" Neq;
158 
159     alias BinaryOp!"&" BinAnd;
160     alias BinaryOp!"^" BinXor;
161     alias BinaryOp!"|" BinOr;
162 
163     alias BinaryOp!"&&" And;
164     alias BinaryOp!"||" Or;
165 
166     unittest
167     {
168         Plural op = new Mul(new Number(5), new Minus(new Number(10)));
169         assert(op() == -50);
170         op = new Eq(new Number(42), new Add(new Number(20), new Number(22)));
171         assert(op() == 1);
172         op = new Div(new Number(12), new Number(3));
173         assert(op() == 4);
174     }
175 
176     struct Tokenizer
177     {
178     pure:
179         this(string contents) {
180             content = contents;
181             get();
182         }
183 
184         @property ushort front() const pure nothrow @nogc {
185             return current;
186         }
187         @property bool empty() const pure nothrow @nogc {
188             return current == 0;
189         }
190         void popFront() {
191             get();
192         }
193         int getNumber() {
194             if (current == NUM)
195                 return number;
196             else
197                 throw new PluralFormException("Not a number");
198         }
199     private:
200         @trusted void get() {
201             while(content.length > pos && isWhite(content[pos])) {
202                 pos++;
203             }
204             if (pos >= content.length) {
205                 current = 0;
206                 return;
207             }
208             if (content.length >= pos+2) {
209                 pos += 2;
210                 switch(content[pos-2..pos]) {
211                     case "<<": current = SHL; return;
212                     case ">>": current = SHR; return;
213                     case "&&": current = AND; return;
214                     case "||": current = OR; return;
215                     case "<=": current = LTE; return;
216                     case ">=": current = GTE; return;
217                     case "==": current = EQ; return;
218                     case "!=": current = NEQ; return;
219                     default: pos -= 2; break;
220                 }
221             }
222             if (isDigit(content[pos])) {
223                 auto tmp = content[pos..$];
224                 number = parse!int(tmp);
225                 current = NUM;
226                 pos += tmp.ptr - (content.ptr + pos);
227             } else {
228                 current = cast(ushort)content[pos];
229                 pos++;
230             }
231         }
232 
233         int number;
234         ushort current;
235         size_t pos;
236         string content;
237     }
238 
239     unittest
240     {
241         string contents = "n %10 ==1\n";
242         auto tokenizer = Tokenizer(contents);
243         assert(!tokenizer.empty);
244         assert(tokenizer.front == 'n');
245         tokenizer.popFront();
246         assert(tokenizer.front == '%');
247         tokenizer.popFront();
248         assert(tokenizer.front == NUM);
249         assert(tokenizer.getNumber == 10);
250         tokenizer.popFront();
251         assert(tokenizer.front == EQ);
252         tokenizer.popFront();
253         assert(tokenizer.front == NUM);
254         assert(tokenizer.getNumber == 1);
255         tokenizer.popFront();
256         assert(tokenizer.empty);
257 
258         tokenizer = Tokenizer("");
259         assert(tokenizer.empty);
260     }
261 
262     final class Variable : Plural
263     {
264     pure:
265         this() {
266         }
267         override int opCall(int n) const {
268             return n;
269         }
270         override Plural clone() {
271             return new Variable();
272         }
273     }
274 
275     final class Conditional : Plural
276     {
277     pure:
278         this(Plural cond, Plural res, Plural alt) {
279             this.cond = cond;
280             this.res = res;
281             this.alt = alt;
282         }
283         override int opCall(int n) const {
284             return cond(n) ? res(n) : alt(n);
285         }
286         override Plural clone() {
287             return new Conditional(cond, res, alt);
288         }
289     private:
290         Plural cond, res, alt;
291     }
292 
293     struct Parser
294     {
295     pure:
296         this(Tokenizer tokenizer) {
297             t = tokenizer;
298         }
299 
300         this(string content) {
301             this(Tokenizer(content));
302         }
303 
304         Plural compile() {
305             Plural expr = condExpr();
306             if (expr && !t.empty) {
307                 throw new PluralFormException("Not in the end");
308             }
309             return expr;
310         }
311 
312     private:
313         Plural valueExpr() {
314             if (t.front == '(') {
315                 t.popFront();
316                 Plural op = condExpr();
317                 if (op is null)
318                     return null;
319                 if (t.front != ')')
320                     throw new PluralFormException("Missing ')' in expression");
321                 t.popFront();
322                 return op;
323             } else if (t.front == NUM) {
324                 int number = t.getNumber();
325                 t.popFront();
326                 return new Number(number);
327             } else if (t.front == 'n') {
328                 t.popFront();
329                 return new Variable();
330             } else {
331                 throw new PluralFormException("Unknown operand");
332             }
333             assert(false);
334         }
335 
336         Plural unaryExpr() {
337             Plural op1;
338             ushort op = t.front;
339             if (op == '-' || op == '~' || op == '!') {
340                 t.popFront();
341                 op1 = unaryExpr();
342                 if (op1) {
343                     switch(op) {
344                         case '-': return new Minus(op1);
345                         case '~': return new Invert(op1);
346                         case '!': return new Not(op1);
347                         default: assert(false);
348                     }
349                 } else {
350                     return null;
351                 }
352             } else {
353                 return valueExpr();
354             }
355         }
356 
357         static int getPrec(const ushort op) {
358             switch(op) {
359                 case '/':
360                 case '*':
361                 case '%':
362                     return 10;
363                 case '+':
364                 case '-':
365                     return 9;
366                 case SHL:
367                 case SHR:
368                     return 8;
369                 case '>':
370                 case '<':
371                 case GTE:
372                 case LTE:
373                     return 7;
374                 case  EQ:
375                 case NEQ:
376                     return 6;
377                 case '&':
378                     return 5;
379                 case '^':
380                     return 4;
381                 case '|':
382                     return 3;
383                 case AND:
384                     return 2;
385                 case  OR:
386                     return 1;
387                 default:
388                     return 0;
389             }
390         }
391 
392         static Plural binaryFactory(const ushort op, Plural left, Plural right) {
393             switch(op) {
394                 case '/':  return new Div(left,right);
395                 case '*':  return new Mul(left,right);
396                 case '%':  return new Mod(left,right);
397                 case '+':  return new Add(left,right);
398                 case '-':  return new Sub(left,right);
399                 case SHL:  return new Shl(left,right);
400                 case SHR:  return new Shr(left,right);
401                 case '>':  return new  Gt(left,right);
402                 case '<':  return new  Lt(left,right);
403                 case GTE:  return new Gte(left,right);
404                 case LTE:  return new Lte(left,right);
405                 case  EQ:  return new  Eq(left,right);
406                 case NEQ:  return new Neq(left,right);
407                 case '&':  return new BinAnd(left,right);
408                 case '^':  return new BinXor(left,right);
409                 case '|':  return new BinOr(left,right);
410                 case AND:  return new And(left,right);
411                 case  OR:  return new Or(left,right);
412                 default:   return null;
413             }
414         }
415 
416         Plural binaryExpr(const int prec = 1) {
417             assert(prec >= 1 && prec <= 11);
418             Plural op1,op2;
419             if (prec == 11)
420                 op1 = unaryExpr();
421             else
422                 op1 = binaryExpr(prec+1);
423             if (op1 is null)
424                 return null;
425             if (prec != 11) {
426                 while(getPrec(t.front) == prec) {
427                     ushort o = t.front;
428                     t.popFront();
429                     op2 = binaryExpr(prec+1);
430                     if (op2 is null)
431                         return null;
432                     op1 = binaryFactory(o, op1, op2);
433                 }
434             }
435 
436             return op1;
437         }
438 
439         Plural condExpr() {
440             Plural cond, case1, case2;
441             cond = binaryExpr();
442             if(cond is null)
443                 return null;
444             if(t.front == '?') {
445                 t.popFront();
446                 case1 = condExpr();
447                 if(case1 is null)
448                     return null;
449                 if(t.front != ':')
450                     throw new PluralFormException("Missing ':' in conditional operator");
451                 t.popFront();
452                 case2 = condExpr();
453                 if(case2 is null)
454                     return null;
455             } else {
456                 return cond;
457             }
458             return new Conditional(cond,case1,case2);
459         }
460 
461         Tokenizer t;
462     }
463 
464     unittest
465     {
466         auto parser = new Parser("(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)");
467         auto expr = parser.compile();
468         assert(expr !is null);
469         assert(expr(1) == 0);
470         assert(expr(101) == 0);
471         assert(expr(2) == 1);
472         assert(expr(24) == 1);
473         assert(expr(104) == 1);
474         assert(expr(222) == 1);
475         assert(expr(11) == 2);
476         assert(expr(14) == 2);
477         assert(expr(111) == 2);
478         assert(expr(210) == 2);
479 
480         import std.exception : assertThrown;
481         assertThrown!PluralFormException(new Parser("").compile());
482         assertThrown!PluralFormException(new Parser("n?1").compile());
483         assertThrown!PluralFormException(new Parser("(2-1").compile());
484         assertThrown!PluralFormException(new Parser("p").compile());
485         assertThrown!PluralFormException(new Parser("1+2;").compile());
486     }
487 }
488 
489 import std.exception : assumeUnique, enforce;
490 import std.range : iota, assumeSorted, drop, dropOne;
491 import std.algorithm.iteration : map, splitter;
492 import std.algorithm.searching : all, find, findSkip, skipOver;
493 import std.algorithm.sorting : isSorted;
494 import std.string : lineSplitter;
495 import std.typecons : tuple;
496 
// Magic number expected in the first 4 bytes of .mo data.
private enum int moMagic = 0x950412de;
498 
499 /**
500  * Struct representing .mo file.
501  *
502  * Default constructed object returns untranslated messages.
503  */
@safe struct MoFile
{
    /**
     * Read from file.
     * Throws:
     * $(D mofile.MoFileException) if data is in invalid or unsupported format.
     * $(D mofile.PluralFormException) if plural form expression could not be parsed.
     * $(B FileException) on file reading error.
     */
    @trusted this(string fileName) {
        import std.file : read;
        this(read(fileName).assumeUnique);
    }

    // Remove all trailing characters of line that are listed in chars.
    private static string stripRight(string line, string chars) nothrow @safe pure {
        import std.string : indexOf;
        for (; line.length > 0; line = line[0 .. $ - 1]) {
            if (chars.indexOf(line[$ - 1]) == -1)
                break;
        }
        return line;
    }
    unittest
    {
        assert(stripRight("hello\n\r", "\n\r") == "hello");
        assert(stripRight("hello\r\n", "\n\r") == "hello");
        assert(stripRight("hello\r", "\n\r") == "hello");
        assert(stripRight("hello\n", "\n\r") == "hello");
        assert(stripRight("hello", "\n\r") == "hello");
    }

    /**
     * Constructor from data.
     * Data must be immutable and live as long as translated messages are used, because it's used to return strings.
     * Throws:
     * $(D mofile.MoFileException) if data is in invalid or unsupported format.
     * $(D mofile.PluralFormException) if plural form expression could not be parsed.
     */
    @safe this(immutable(void)[] data) pure {
        this.data = data;
        const magic = readValue!int(0);
        if (magic != moMagic) {
            throw new MoFileException("Wrong magic");
        }
        const revision = readValue!int(int.sizeof);
        if (revision != 0) {
            throw new MoFileException("Unknown revision");
        }

        count = readValue!int(int.sizeof*2);
        baseOffsetOrig = readValue!int(int.sizeof*3);
        baseOffsetTr = readValue!int(int.sizeof*4);

        if (count <= 0) {
            throw new MoFileException("Invalid count of msgids, must be at least 1");
        }
        // Negative offsets can only come from a malformed file. Reject them
        // here, so they never get into the offset arithmetic of getMessage.
        if (baseOffsetOrig < 0 || baseOffsetTr < 0) {
            throw new MoFileException("Invalid offset of string table");
        }

        auto mapped = iota(0,count).map!(i => getMessage(baseOffsetOrig, i));
        enforce!MoFileException(mapped.isSorted, "Invalid .mo file: message ids are not sorted");
        if (!mapped.empty && mapped.front.length == 0) {
            enforce!MoFileException(mapped.dropOne.all!"!a.empty", "Some msgid besides the reserved one is empty");

            // The header is the translation of the empty msgid. It's parsed only
            // when the empty msgid is actually present, otherwise the first
            // translation is an ordinary message.
            string headerText = getMessage(baseOffsetTr, 0);
            foreach(line; headerText.lineSplitter) {
                if (line.skipOver("Plural-Forms:")) {
                    if (line.findSkip("plural=")) {
                        // Trailing whitespace is stripped along with the
                        // terminating ';', so "plural=n != 1; " is accepted too.
                        string expr = stripRight(line, "\n\r; \t");
                        auto parser = new Parser(expr);
                        compiled = parser.compile();
                    }
                }
            }
        }
    }

    /**
     * .mo file header that includes some info like creation date, language and translator's name.
     */
    string header() pure const {
        return gettext("");
    }

    /**
     * Get translated message.
     * Params:
     *  msgid = Message id (usually untranslated string)
     * Returns: Translated message for the msgid.
     *  If translation for this msgid does not exist or MoFile is default constructed the msgid is returned.
     */
    string gettext(string msgid) pure const {
        int index = getIndex(msgid);
        if (index >= 0) {
            string translated = getMessage(baseOffsetTr, index);
            // Translation of a message with plural forms holds all forms separated by \0, the first one is singular.
            auto splitted = translated.splitter('\0');
            if (!splitted.empty && splitted.front.length)
                return splitted.front;
        }
        return msgid;
    }

    /**
     * Get translated message considering plural forms.
     * Params:
     *  msgid = Untranslated message in singular form
     *  msgid_plural = Untranslated message in plural form.
     *  n = Number to calculate a plural form.
     * Returns: Translated string in plural form dependent on number n.
     *  If translation for this msgid does not exist or MoFile is default constructed then the msgid is returned if n == 1 and msgid_plural otherwise.
     *  If the header has no plural form expression, the first form is used for n == 1 and the second one otherwise.
     */
    string ngettext(string msgid, string msgid_plural, int n) pure const {
        int index = getIndex(msgid);
        if (index >= 0) {
            string translated = getMessage(baseOffsetTr, index);
            auto splitted = translated.splitter('\0');
            if (!splitted.empty && splitted.front.length) {
                // Without an expression from the header fall back to the rule
                // gettext uses by default: singular for 1, plural otherwise.
                const int pluralForm = compiled !is null ? compiled(n) : (n != 1 ? 1 : 0);
                if (pluralForm >= 0) {
                    auto forms = splitted.drop(pluralForm);
                    if (!forms.empty)
                        return forms.front;
                }
            }
        }
        return n == 1 ? msgid : msgid_plural;
    }

private:
    // Binary search of message among msgids (singular parts only).
    // Returns the index of the message or -1 if it's not found or there is no data.
    @trusted int getIndex(string message) pure const {
        if (data.length == 0)
            return -1;
        auto sorted = iota(0, count).map!(i => tuple(i, getMessageSingular(baseOffsetOrig, i))).assumeSorted!"a[1] < b[1]";
        auto found = sorted.equalRange(tuple(0, message));
        if (found.empty) {
            return -1;
        } else {
            return found.front[0];
        }
    }

    // Read a value of type T located at offset.
    // Throws MoFileException if the value does not lie within data.
    @trusted T readValue(T)(size_t offset) pure const
    {
        // Written without addition, so a huge offset can't wrap around and pass the check.
        if (offset > data.length || data.length - offset < T.sizeof) {
            throw new MoFileException("Value is out of bounds");
        }
        T value = *(cast(const(T)*)data[offset..(offset+T.sizeof)].ptr);
        return value;
    }

    // Get the string of length len located at offset.
    // Throws MoFileException if the string does not lie within data.
    @trusted string readString(int len, int offset) pure const
    {
        // Both values come from the file: negative ones are invalid, and the
        // range check is written without addition to avoid overflow.
        if (len < 0 || offset < 0 || offset > data.length || data.length - offset < len) {
            throw new MoFileException("String is out of bounds");
        }
        const size_t start = offset;
        string s = cast(string)data[start..start+len];
        return s;
    }

    // Get the i-th string of the table located at offset.
    // Each table entry is a pair of ints: string length followed by string offset.
    @trusted string getMessage(int offset, int i) pure const {
        const size_t entry = offset + i*int.sizeof*2;
        return readString(readValue!int(entry), readValue!int(entry + int.sizeof));
    }

    // Same as getMessage, but returns only the part before the first \0.
    @trusted string getMessageSingular(int offset, int i) pure const {
        auto splitted = getMessage(offset, i).splitter('\0');
        if (splitted.empty) {
            return "";
        } else {
            return splitted.front;
        }
    }

    int count;
    int baseOffsetOrig;
    int baseOffsetTr;
    immutable(void)[] data;
    Plural compiled;
}
680 
// Default constructed MoFile returns untranslated messages.
unittest
{
    MoFile untranslated;

    assert(untranslated.header.length == 0);
    assert(untranslated.gettext("") == "");
    assert(untranslated.gettext("Hello") == "Hello");

    // Singular is used for 1 only.
    assert(untranslated.ngettext("File", "Files", 1) == "File");
    assert(untranslated.ngettext("File", "Files", 2) == "Files");
    assert(untranslated.ngettext("File", "Files", 0) == "Files");
}
691 
// Minimal .mo data built in memory: one message "abcd" translated as "efgh".
unittest
{
    import std.bitmanip : nativeToLittleEndian;
    import std.string : representation;
    import std.exception : assertThrown, assertNotThrown;

    static immutable(ubyte)[] le(int value)
    {
        auto bytes = nativeToLittleEndian(value);
        return bytes[].idup;
    }

    // magic, revision, count, offset of msgid table, offset of translation table
    immutable(ubyte)[] moHeader = le(moMagic) ~ le(0) ~ le(1) ~ le(20) ~ le(28);
    // (length, offset) of the msgid followed by (length, offset) of the translation
    immutable(ubyte)[] offsets = le(4) ~ le(36) ~ le(4) ~ le(40);
    immutable(ubyte)[] data = moHeader ~ offsets ~ "abcd".representation ~ "efgh".representation;
    assert(data.length == 44);

    MoFile moFile;
    assertNotThrown(moFile = MoFile(data));
    assert(moFile.gettext("abcd") == "efgh");
}