// processNumbers.ts (6.9 KB)
  /**
   * Decides, for each run of number-bearing pinyin tokens, whether to output the
   * spelled-out syllables or the Arabic-numeral form.
   *
   * A "numerical token" has the shape `letters#digits` (e.g. `san#3`) or `letters#`
   * (e.g. `nian#`). Consecutive numerical tokens, together with the delimiters between
   * them, are collected into a run and two renderings are built side by side:
   *  - the text version: the letters of every token (the `#...` suffix is dropped);
   *  - the number version: the digits of every token, or its letters when it has no digits.
   *
   * The number version is chosen when the run looks like an ordinal (`di N`), four
   * consecutive single digits, a date (`N nian N yue` or `N yue N ri`), or when
   * `tag`/`code` identify a subfield in which numbers are always numeric
   * (tag "245" or "830" with code "n"). In that case literal digit sequences are also
   * folded into real numbers, e.g. "2 10 7" becomes "27".
   *
   * @param pinyinString pinyin text in which number-related syllables carry a `#` suffix
   * @param tag field tag; only "245" and "830" receive special treatment
   * @param code subfield code; only "n" (with the tags above) receives special treatment
   * @returns the input with every `#` token resolved to its text or numeric form
   */
  private processNumbers(pinyinString: string, tag: string, code: string): string {
    // Subfields where we definitely want to treat numbers as numbers.
    const isNumberSubfield = (tag === "245" || tag === "830") && code === "n";

    // NOTE(review): once any run selects the number version this flag stays true for
    // every later run in the same string. That mirrors the previous behaviour; confirm
    // it is intended rather than a per-run decision.
    let useNumVersion = isNumberSubfield;

    // Patterns are created once per call instead of once per loop iteration.
    const numTokenRe = /^([A-Za-z]+)#([0-9]*)$/;
    const ordinalRe = /^di [0-9]/i;
    const fourDigitsRe = /[0-9] [0-9] [0-9] [0-9]/;
    const yearMonthRe = /[0-9]+ nian [0-9]+ yue/i;
    // Fixed: this pattern used to begin with a stray '"' and therefore never matched
    // ordinary month/day strings.
    const monthDayRe = /[0-9]+ yue [0-9]+ ri/i;

    /*
     * The input is split on any whitespace or punctuation character (except '#').
     * Because the delimiter is captured, the resulting array alternates
     * token, delimiter, token, ... so true tokens sit at even indexes and the
     * array always ends on a (possibly empty) token.
     */
    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
    const n = tokens.length;
    let outputString = "";

    for (let i = 0; i < n; i++) {
      const toki = tokens[i];

      // Non-numerical tokens and delimiters are copied through unchanged.
      if (!numTokenRe.test(toki)) {
        outputString += toki;
        continue;
      }

      // tokens[i] starts a numerical run: consume it and every numerical token after it.
      let textVersion = "";
      let numVersion = "";

      for (let j = i; j < n; j++) {
        const tokj = tokens[j];
        const m = tokj.match(numTokenRe); // delimiters never match, so m !== null implies a token slot
        const atEnd = j === n - 1;

        if (m) {
          // Numerical token: extend both renderings.
          textVersion += m[1];
          numVersion += m[2] === "" ? m[1] : m[2];
          if (!atEnd) {
            continue; // keep consuming the run
          }
        } else if (atEnd) {
          // Last token of the string is non-numerical: just tack it on.
          textVersion += tokj;
          numVersion += tokj;
        } else if (j % 2 !== 0) {
          // A delimiter inside the run: just tack it on.
          textVersion += tokj;
          numVersion += tokj;
          continue;
        } else if (textVersion.length > 0 && numVersion.length > 0) {
          // A non-numerical token ends the run before the end of the string. Drop the
          // delimiter appended just before it; the outer loop re-emits it. The delimiter's
          // real length is used because a punctuation character outside the BMP
          // occupies two UTF-16 code units.
          const trailingDelimiterLength = tokens[j - 1].length;
          textVersion = textVersion.substring(0, textVersion.length - trailingDelimiterLength);
          numVersion = numVersion.substring(0, numVersion.length - trailingDelimiterLength);
        }

        // The run is complete: decide which rendering to use.
        // The number version is used for ordinals and date strings.
        if (
          useNumVersion ||
          ordinalRe.test(numVersion) ||
          fourDigitsRe.test(numVersion) ||
          yearMonthRe.test(numVersion) ||
          monthDayRe.test(numVersion)
        ) {
          useNumVersion = true;

          /*
           * The string may still contain literal translations of Chinese numerals.
           * Fold them into Arabic numerals: a number followed by a power-of-ten style
           * multiplier is multiplied ("2 10" -> "20"), then a round number followed by
           * another number is added ("20 7" -> "27"). Each pass removes one space, so
           * the loop terminates.
           */
          while (/[0-9] 10+/.test(numVersion) || /[1-9]0+ [1-9]/.test(numVersion)) {
            const product = numVersion.match(/([0-9]+) ([1-9]0+)/);
            if (product) {
              const multiplied = Number(product[1]) * Number(product[2]);
              numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(multiplied));
              continue;
            }
            const addition = numVersion.match(/([1-9]0+) ([0-9]+)/);
            if (!addition) {
              break;
            }
            const added = Number(addition[1]) + Number(addition[2]);
            numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(added));
          }

          // Four separately written digits (e.g. a year) are joined into one number.
          numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");

          // In number subfields every remaining space between digits is removed as well.
          if (isNumberSubfield) {
            while (/[0-9] [0-9]/.test(numVersion)) {
              numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
            }
          }
        }

        outputString += useNumVersion ? numVersion : textVersion;

        // At the end of the string we are done. Otherwise back up so that the outer
        // loop's i++ lands on the delimiter that follows the last numerical token.
        i = atEnd ? j : j - 2;
        break;
      }
    }

    return outputString;
  }