1 /**
2  * Parse .mo files and find translated messages.
3  * Authors:
4  *  $(LINK2 https://github.com/FreeSlave, Roman Chistokhodov)
5  * Copyright:
6  *  Roman Chistokhodov, 2018
7  * License:
8  *  $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
9  * See_Also:
10  *  $(LINK2 https://www.gnu.org/software/gettext/manual/html_node/MO-Files.html, The Format of GNU MO Files)
11  */
12 
13 module mofile;
/**
 * Exception thrown when a plural form expression can't be tokenized, parsed or evaluated.
 */
class PluralFormException : Exception
{
    /// Forwards all arguments to the $(D Exception) constructor.
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable nextInChain = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
21 
/**
 * Exception thrown when .mo data is malformed or has an unsupported format.
 */
class MoFileException : Exception
{
    /// Forwards all arguments to the $(D Exception) constructor.
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable nextInChain = null) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
29 
30 private @safe
31 {
32     import std.conv : parse;
33     import std.ascii;
    // Token codes produced by Tokenizer for tokens that are not a single character.
    // Single-character tokens are reported as their character code, so these values
    // start right above ubyte.max.
    enum : ushort {
        SHL = ubyte.max + 1, // "<<"
        SHR,                 // ">>"
        AND,                 // "&&"
        OR,                  // "||"
        LTE,                 // "<="
        GTE,                 // ">="
        EQ,                  // "=="
        NEQ,                 // "!="
        NUM,                 // integer literal, its value is available via Tokenizer.getNumber
    }
45 
46     class Plural
47     {
48     pure:
49         abstract int opCall(int n = 0) const;
50         abstract Plural clone();
51     }
52 
53     class Unary : Plural
54     {
55     pure:
56         this(Plural op) {
57             op1 = op;
58         }
59     protected:
60         Plural op1;
61     }
62 
63     class Binary : Plural
64     {
65     pure:
66         this(Plural first, Plural second) {
67             op1 = first;
68             op2 = second;
69         }
70     protected:
71         Plural op1, op2;
72     }
73 
74     final class Number : Plural
75     {
76     pure:
77         this(int number) {
78             num = number;
79         }
80         override Plural clone() {
81             return new Number(num);
82         }
83         override int opCall(int) const {
84             return num;
85         }
86     private:
87         int num;
88     }
89 
90 
91     final class UnaryOp(string op) : Unary
92     {
93     pure:
94         this(Plural op1) {
95             super(op1);
96         }
97         override int opCall(int n) const {
98             return mixin(op ~ " op1(n)");
99         }
100         override Plural clone() {
101             return new UnaryOp!(op)(op1.clone());
102         }
103     }
104 
105     final class BinaryOp(string op) : Binary
106     {
107     pure:
108         this(Plural first, Plural second) {
109             super(first, second);
110         }
111         override int opCall(int n) const {
112             return mixin("op1(n)" ~ op ~ "op2(n)");
113         }
114         override Plural clone() {
115             return new BinaryOp!(op)(op1.clone(), op2.clone());
116         }
117     }
118 
119     final class BinaryOpD(string op) : Binary
120     {
121     pure:
122         this(Plural first, Plural second) {
123             super(first, second);
124         }
125         override int opCall(int n) const {
126             int v2 = op2(n);
127             if (v2 == 0) {
128                 throw new PluralFormException("Division by zero during plural form computation");
129             }
130             return mixin("op1(n)" ~ op ~ "v2");
131         }
132         override Plural clone() {
133             return new BinaryOp!(op)(op1.clone(), op2.clone());
134         }
135     }
136 
137     alias UnaryOp!"!" Not;
138     alias UnaryOp!"-" Minus;
139     alias UnaryOp!"~" Invert;
140 
141     alias BinaryOp!"*" Mul;
142     alias BinaryOpD!"/" Div;
143     alias BinaryOpD!"%" Mod;
144 
145     alias BinaryOp!"+" Add;
146     alias BinaryOp!"-" Sub;
147 
148     alias BinaryOp!"<<" Shl;
149     alias BinaryOp!">>" Shr;
150 
151     alias BinaryOp!">" Gt;
152     alias BinaryOp!"<" Lt;
153     alias BinaryOp!">=" Gte;
154     alias BinaryOp!"<=" Lte;
155 
156     alias BinaryOp!"==" Eq;
157     alias BinaryOp!"!=" Neq;
158 
159     alias BinaryOp!"&" BinAnd;
160     alias BinaryOp!"^" BinXor;
161     alias BinaryOp!"|" BinOr;
162 
163     alias BinaryOp!"&&" And;
164     alias BinaryOp!"||" Or;
165 
166     unittest
167     {
168         Plural op = new Mul(new Number(5), new Minus(new Number(10)));
169         assert(op() == -50);
170         op = new Eq(new Number(42), new Add(new Number(20), new Number(22)));
171         assert(op() == 1);
172         op = new Div(new Number(12), new Number(3));
173         assert(op() == 4);
174     }
175 
176     struct Tokenizer
177     {
178     pure:
179         this(string contents) {
180             content = contents;
181             get();
182         }
183 
184         @property ushort front() const pure nothrow @nogc {
185             return current;
186         }
187         @property bool empty() const pure nothrow @nogc {
188             return current == 0;
189         }
190         void popFront() {
191             get();
192         }
193         int getNumber() {
194             if (current == NUM)
195                 return number;
196             else
197                 throw new PluralFormException("Not a number");
198         }
199     private:
200         @trusted void get() {
201             while(content.length > pos && isWhite(content[pos])) {
202                 pos++;
203             }
204             if (pos >= content.length) {
205                 current = 0;
206                 return;
207             }
208             if (content.length >= pos+2) {
209                 pos += 2;
210                 switch(content[pos-2..pos]) {
211                     case "<<": current = SHL; return;
212                     case ">>": current = SHR; return;
213                     case "&&": current = AND; return;
214                     case "||": current = OR; return;
215                     case "<=": current = LTE; return;
216                     case ">=": current = GTE; return;
217                     case "==": current = EQ; return;
218                     case "!=": current = NEQ; return;
219                     default: pos -= 2; break;
220                 }
221             }
222             if (isDigit(content[pos])) {
223                 auto tmp = content[pos..$];
224                 number = parse!int(tmp);
225                 current = NUM;
226                 pos += tmp.ptr - (content.ptr + pos);
227             } else {
228                 current = cast(ushort)content[pos];
229                 pos++;
230             }
231         }
232 
233         int number;
234         ushort current;
235         size_t pos;
236         string content;
237     }
238 
239     unittest
240     {
241         string contents = "n %10 ==1\n";
242         auto tokenizer = Tokenizer(contents);
243         assert(!tokenizer.empty);
244         assert(tokenizer.front == 'n');
245         tokenizer.popFront();
246         assert(tokenizer.front == '%');
247         tokenizer.popFront();
248         assert(tokenizer.front == NUM);
249         assert(tokenizer.getNumber == 10);
250         tokenizer.popFront();
251         assert(tokenizer.front == EQ);
252         tokenizer.popFront();
253         assert(tokenizer.front == NUM);
254         assert(tokenizer.getNumber == 1);
255         tokenizer.popFront();
256         assert(tokenizer.empty);
257 
258         tokenizer = Tokenizer("");
259         assert(tokenizer.empty);
260     }
261 
262     final class Variable : Plural
263     {
264     pure:
265         this() {
266         }
267         override int opCall(int n) const {
268             return n;
269         }
270         override Plural clone() {
271             return new Variable();
272         }
273     }
274 
275     final class Conditional : Plural
276     {
277     pure:
278         this(Plural cond, Plural res, Plural alt) {
279             this.cond = cond;
280             this.res = res;
281             this.alt = alt;
282         }
283         override int opCall(int n) const {
284             return cond(n) ? res(n) : alt(n);
285         }
286         override Plural clone() {
287             return new Conditional(cond, res, alt);
288         }
289     private:
290         Plural cond, res, alt;
291     }
292 
293     struct Parser
294     {
295     pure:
296         this(Tokenizer tokenizer) {
297             t = tokenizer;
298         }
299 
300         this(string content) {
301             this(Tokenizer(content));
302         }
303 
304         Plural compile() {
305             Plural expr = condExpr();
306             if (expr && !t.empty) {
307                 throw new PluralFormException("Not in the end");
308             }
309             return expr;
310         }
311 
312     private:
313         Plural valueExpr() {
314             if (t.front == '(') {
315                 t.popFront();
316                 Plural op = condExpr();
317                 if (op is null)
318                     return null;
319                 if (t.front != ')')
320                     throw new PluralFormException("Missing ')' in expression");
321                 t.popFront();
322                 return op;
323             } else if (t.front == NUM) {
324                 int number = t.getNumber();
325                 t.popFront();
326                 return new Number(number);
327             } else if (t.front == 'n') {
328                 t.popFront();
329                 return new Variable();
330             } else {
331                 throw new PluralFormException("Unknown operand");
332             }
333             assert(false);
334         }
335 
336         Plural unaryExpr() {
337             Plural op1;
338             ushort op = t.front;
339             if (op == '-' || op == '~' || op == '!') {
340                 t.popFront();
341                 op1 = unaryExpr();
342                 if (op1) {
343                     switch(op) {
344                         case '-': return new Minus(op1);
345                         case '~': return new Invert(op1);
346                         case '!': return new Not(op1);
347                         default: assert(false);
348                     }
349                 } else {
350                     return null;
351                 }
352             } else {
353                 return valueExpr();
354             }
355         }
356 
357         static int getPrec(const ushort op) {
358             switch(op) {
359                 case '/':
360                 case '*':
361                 case '%':
362                     return 10;
363                 case '+':
364                 case '-':
365                     return 9;
366                 case SHL:
367                 case SHR:
368                     return 8;
369                 case '>':
370                 case '<':
371                 case GTE:
372                 case LTE:
373                     return 7;
374                 case  EQ:
375                 case NEQ:
376                     return 6;
377                 case '&':
378                     return 5;
379                 case '^':
380                     return 4;
381                 case '|':
382                     return 3;
383                 case AND:
384                     return 2;
385                 case  OR:
386                     return 1;
387                 default:
388                     return 0;
389             }
390         }
391 
392         static Plural binaryFactory(const ushort op, Plural left, Plural right) {
393             switch(op) {
394                 case '/':  return new Div(left,right);
395                 case '*':  return new Mul(left,right);
396                 case '%':  return new Mod(left,right);
397                 case '+':  return new Add(left,right);
398                 case '-':  return new Sub(left,right);
399                 case SHL:  return new Shl(left,right);
400                 case SHR:  return new Shr(left,right);
401                 case '>':  return new  Gt(left,right);
402                 case '<':  return new  Lt(left,right);
403                 case GTE:  return new Gte(left,right);
404                 case LTE:  return new Lte(left,right);
405                 case  EQ:  return new  Eq(left,right);
406                 case NEQ:  return new Neq(left,right);
407                 case '&':  return new BinAnd(left,right);
408                 case '^':  return new BinXor(left,right);
409                 case '|':  return new BinOr(left,right);
410                 case AND:  return new And(left,right);
411                 case  OR:  return new Or(left,right);
412                 default:   return null;
413             }
414         }
415 
416         Plural binaryExpr(const int prec = 1) {
417             assert(prec >= 1 && prec <= 11);
418             Plural op1,op2;
419             if (prec == 11)
420                 op1 = unaryExpr();
421             else
422                 op1 = binaryExpr(prec+1);
423             if (op1 is null)
424                 return null;
425             if (prec != 11) {
426                 while(getPrec(t.front) == prec) {
427                     ushort o = t.front;
428                     t.popFront();
429                     op2 = binaryExpr(prec+1);
430                     if (op2 is null)
431                         return null;
432                     op1 = binaryFactory(o, op1, op2);
433                 }
434             }
435 
436             return op1;
437         }
438 
439         Plural condExpr() {
440             Plural cond, case1, case2;
441             cond = binaryExpr();
442             if(cond is null)
443                 return null;
444             if(t.front == '?') {
445                 t.popFront();
446                 case1 = condExpr();
447                 if(case1 is null)
448                     return null;
449                 if(t.front != ':')
450                     throw new PluralFormException("Missing ':' in conditional operator");
451                 t.popFront();
452                 case2 = condExpr();
453                 if(case2 is null)
454                     return null;
455             } else {
456                 return cond;
457             }
458             return new Conditional(cond,case1,case2);
459         }
460 
461         Tokenizer t;
462     }
463 
464     unittest
465     {
466         auto parser = new Parser("(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)");
467         auto expr = parser.compile();
468         assert(expr !is null);
469         assert(expr(1) == 0);
470         assert(expr(101) == 0);
471         assert(expr(2) == 1);
472         assert(expr(24) == 1);
473         assert(expr(104) == 1);
474         assert(expr(222) == 1);
475         assert(expr(11) == 2);
476         assert(expr(14) == 2);
477         assert(expr(111) == 2);
478         assert(expr(210) == 2);
479 
480         import std.exception : assertThrown;
481         assertThrown!PluralFormException(new Parser("").compile());
482         assertThrown!PluralFormException(new Parser("n?1").compile());
483         assertThrown!PluralFormException(new Parser("(2-1").compile());
484         assertThrown!PluralFormException(new Parser("p").compile());
485         assertThrown!PluralFormException(new Parser("1+2;").compile());
486     }
487 }
488 
489 import std.exception : assumeUnique, enforce;
490 import std.range : iota, assumeSorted, drop, dropOne;
491 import std.algorithm.iteration : map, splitter;
492 import std.algorithm.searching : all, find, findSkip, skipOver;
493 import std.algorithm.sorting : isSorted;
494 import std.string : lineSplitter, stripRight;
495 import std.typecons : tuple;
496 
// Value the first 32-bit word of .mo data must have to be accepted by MoFile.
private enum int moMagic = 0x950412de;
498 
/**
 * Struct representing .mo file.
 *
 * Default constructed object returns untranslated messages.
 */
@safe struct MoFile
{
    /**
     * Read from file.
     * Throws:
     *  $(D mofile.MoFileException) if data is in invalid or unsupported format.
     *  $(D mofile.PluralFormException) if plural form expression could not be parsed.
     *  $(B FileException) on file reading error.
     */
    @trusted this(string fileName) {
        import std.file : read;
        // The buffer returned by read is handed straight to this object and referenced
        // nowhere else, so treating it as immutable is sound.
        this(read(fileName).assumeUnique);
    }

    /**
     * Constructor from data.
     * Data must be immutable and live as long as translated messages are used, because it's used to return strings.
     *
     * Values are read in the byte order of the host, so data written in the opposite
     * byte order is rejected as having a wrong magic number.
     * Throws:
     *  $(D mofile.MoFileException) if data is in invalid or unsupported format.
     *  $(D mofile.PluralFormException) if plural form expression could not be parsed.
     */
    @safe this(immutable(void)[] data) pure {
        this.data = data;
        const magic = readValue!int(0);
        if (magic != moMagic) {
            throw new MoFileException("Wrong magic");
        }
        const revision = readValue!int(int.sizeof);
        if (revision != 0) {
            throw new MoFileException("Unknown revision");
        }

        baseOffsetOrig = readValue!int(int.sizeof*3);
        baseOffsetTr = readValue!int(int.sizeof*4);
        count = readValue!int(int.sizeof*2);

        if (count <= 0) {
            throw new MoFileException("Invalid count of msgids, must be at least 1");
        }

        // Lookup is a binary search, so the table of msgids must be sorted.
        // Walking it also verifies that every msgid lies within data.
        auto mapped = iota(0,count).map!(i => getMessage(baseOffsetOrig, i));
        enforce!MoFileException(mapped.isSorted, "Invalid .mo file: message ids are not sorted");

        // count >= 1 here, so the first entry exists. Only the entry with the empty
        // msgid is the header; when it is absent the first translation is an ordinary
        // message and must not be scanned for header fields.
        if (mapped.front.length == 0) {
            enforce!MoFileException(mapped.dropOne.all!"!a.empty", "Some msgid besides the reserved one is empty");

            string header = getMessage(baseOffsetTr, 0);
            foreach(line; header.lineSplitter) {
                if (line.skipOver("Plural-Forms:")) {
                    if (line.findSkip("plural=")) {
                        string expr = line.stripRight("\n\r;");
                        auto parser = Parser(expr);
                        compiled = parser.compile();
                    }
                }
            }
        }
    }

    /**
     * .mo file header that includes some info like creation date, language and translator's name.
     */
    string header() pure const {
        return gettext("");
    }

    /**
     * Get translated message.
     * Params:
     *  msgid = Message id (usually untranslated string)
     * Returns: Translated message for the msgid.
     *  If translation for this msgid does not exist or MoFile is default constructed the msgid is returned.
     * Throws:
     *  $(D mofile.MoFileException) if the entry being read lies outside of the data.
     */
    string gettext(string msgid) pure const {
        int index = getIndex(msgid);
        if (index >= 0) {
            string translated = getMessage(baseOffsetTr, index);
            // A translation with plural forms stores them separated by '\0'; the first one is the singular form.
            auto splitted = translated.splitter('\0');
            if (!splitted.empty && splitted.front.length)
                return splitted.front;
        }
        return msgid;
    }

    /**
     * Get translated message considering plural forms.
     * Params:
     *  msgid = Untranslated message in singular form
     *  msgid_plural = Untranslated message in plural form.
     *  n = Number to calculate a plural form.
     * Returns: Translated string in plural form dependent on number n.
     *  If translation for this msgid does not exist, no plural form expression was read from the header
     *  or MoFile is default constructed then the msgid is returned if n == 1 and msgid_plural otherwise.
     * Throws:
     *  $(D mofile.MoFileException) if the entry being read lies outside of the data.
     *  $(D mofile.PluralFormException) if the plural form expression divides by zero for n.
     */
    string ngettext(string msgid, string msgid_plural, int n) pure const {
        int index = getIndex(msgid);
        if (compiled !is null && index >= 0) {
            string translated = getMessage(baseOffsetTr, index);
            auto splitted = translated.splitter('\0');
            if (!splitted.empty && splitted.front.length) {
                int pluralForm = compiled(n);
                // Skip to the form selected by the expression; a form index that does
                // not exist leaves the range empty and falls through to the untranslated strings.
                auto forms = splitted.drop(pluralForm);
                if (!forms.empty)
                    return forms.front;
            }
        }
        return n == 1 ? msgid : msgid_plural;
    }

private:
    // Index of message in the table of msgids or -1 if it's not there (binary search).
    @trusted int getIndex(string message) pure const {
        if (data.length == 0)
            return -1;
        auto sorted = iota(0, count).map!(i => tuple(i, getMessageSingular(baseOffsetOrig, i))).assumeSorted!"a[1] < b[1]";
        auto found = sorted.equalRange(tuple(0, message));
        if (found.empty) {
            return -1;
        } else {
            return found.front[0];
        }
    }

    // Read a value of type T located at offset, throws MoFileException if it does not fit into data.
    @trusted T readValue(T)(size_t offset) pure const
    {
        // Written as a subtraction so that a huge offset (e.g. a negative int converted
        // to size_t by the caller) can't wrap around and pass the check.
        if (offset > data.length || data.length - offset < T.sizeof) {
            throw new MoFileException("Value is out of bounds");
        }
        T value = *(cast(const(T)*)data[offset..(offset+T.sizeof)].ptr);
        return value;
    }

    // Get len bytes located at offset as a string, throws MoFileException if they do not fit into data.
    @trusted string readString(int len, int offset) pure const
    {
        // Both values come from the file: reject negative ones and compare without
        // adding them together, so the check itself can't overflow.
        if (len < 0 || offset < 0 || offset > data.length || len > data.length - offset) {
            throw new MoFileException("String is out of bounds");
        }
        const size_t start = offset;
        string s = cast(string)data[start..start+len];
        return s;
    }

    // Get the i-th string of the table starting at offset. Every table entry is a pair
    // of ints: length of the string followed by its offset in data.
    @trusted string getMessage(int offset, int i) pure const {
        const size_t entry = offset + i*int.sizeof*2;
        const int len = readValue!int(entry);
        const int stringOffset = readValue!int(entry + int.sizeof);
        return readString(len, stringOffset);
    }

    // Same as getMessage, but only the part before the first '\0' is returned.
    @trusted string getMessageSingular(int offset, int i) pure const {
        auto splitted = getMessage(offset, i).splitter('\0');
        if (splitted.empty) {
            return "";
        } else {
            return splitted.front;
        }
    }

    int count;              // number of entries in each table
    int baseOffsetOrig;     // offset of the table of msgids
    int baseOffsetTr;       // offset of the table of translations
    immutable(void)[] data; // the whole .mo data, empty in a default constructed object
    Plural compiled;        // plural form expression from the header, null if there is none
}
663 
// A default constructed MoFile has no data, so every lookup returns its arguments.
unittest
{
    MoFile untranslated;

    assert(untranslated.header.length == 0);
    assert(untranslated.gettext("") == "");
    assert(untranslated.gettext("Hello") == "Hello");

    // Singular only for n == 1, plural for everything else.
    assert(untranslated.ngettext("File", "Files", 1) == "File");
    foreach (n; [2, 0])
        assert(untranslated.ngettext("File", "Files", n) == "Files");
}
674 
// Minimal in-memory .mo data with a single message "abcd" translated as "efgh".
unittest
{
    import std.bitmanip : nativeToLittleEndian;
    import std.exception : assertNotThrown;
    import std.string : representation;

    // Header: magic, revision, number of strings, offset of the msgid table,
    // offset of the translation table. Then one (length, offset) pair per table.
    ubyte[] buffer;
    foreach (word; [moMagic, 0, 1, 20, 28, 4, 36, 4, 40]) {
        auto bytes = nativeToLittleEndian(word);
        buffer ~= bytes[];
    }
    buffer ~= "abcd".representation;
    buffer ~= "efgh".representation;

    immutable(ubyte)[] data = buffer.idup;
    assert(data.length == 44);

    MoFile moFile;
    assertNotThrown(moFile = MoFile(data));
    assert(moFile.gettext("abcd") == "efgh");
}