deterministicGrouping.js 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547
  1. /*
  2. MIT License http://www.opensource.org/licenses/mit-license.php
  3. Author Tobias Koppers @sokra
  4. */
  5. "use strict";
  6. // Simulations show these probabilities for a single change
  7. // 93.1% that one group is invalidated
  8. // 4.8% that two groups are invalidated
  9. // 1.1% that 3 groups are invalidated
  10. // 0.1% that 4 or more groups are invalidated
  11. //
  12. // And these for removing/adding 10 lexically adjacent files
  13. // 64.5% that one group is invalidated
  14. // 24.8% that two groups are invalidated
  15. // 7.8% that 3 groups are invalidated
  16. // 2.7% that 4 or more groups are invalidated
  17. //
  18. // And these for removing/adding 3 random files
  19. // 0% that one group is invalidated
  20. // 3.7% that two groups are invalidated
  21. // 80.8% that 3 groups are invalidated
  22. // 12.3% that 4 groups are invalidated
  23. // 3.2% that 5 or more groups are invalidated
  /**
   * Scores how alike two keys are by comparing them position by position.
   * Only the first `min(a.length, b.length)` positions are looked at. Each
   * position adds 10 points when the UTF-16 code units are equal, fewer points
   * the further apart they are, and nothing once they differ by 10 or more.
   *
   * @param {string} a key
   * @param {string} b key
   * @returns {number} the similarity as number
   */
  const similarity = (a, b) => {
    const limit = Math.min(a.length, b.length);
    let score = 0;
    let index = 0;
    while (index < limit) {
      const delta = Math.abs(a.charCodeAt(index) - b.charCodeAt(index));
      // positions that are 10 or more code units apart contribute nothing
      if (delta < 10) score += 10 - delta;
      index++;
    }
    return score;
  };
  /**
   * Derives a short name from two keys: the shared prefix of `a` and `b` plus
   * the first character of `a` at which they differ. When that name is already
   * taken (compared case-insensitively) the prefix of `a` is extended one
   * character at a time. A name returned this way is recorded in `usedNames`
   * in lower case. If no unused prefix shorter than the shorter key exists,
   * `a` itself is returned and `usedNames` is left untouched.
   *
   * @param {string} a key
   * @param {string} b key
   * @param {Set<string>} usedNames set of already used names
   * @returns {string} the common part and a single char for the difference
   */
  const getName = (a, b, usedNames) => {
    const limit = Math.min(a.length, b.length);
    // measure the common prefix
    let end = 0;
    while (end < limit && a.charCodeAt(end) === b.charCodeAt(end)) end++;
    // include the first differing character, when there is one
    if (end < limit) end++;
    for (; end < limit; end++) {
      const candidate = a.slice(0, end);
      const normalized = candidate.toLowerCase();
      if (usedNames.has(normalized)) continue;
      usedNames.add(normalized);
      return candidate;
    }
    // names always contain a hash, so this is always unique
    // we don't need to check usedNames nor add it
    return a;
  };
  /**
   * Adds every entry of `size` onto `total`, mutating `total` in place.
   * Types missing from `total` (or holding a falsy value) start at 0.
   *
   * @param {Record<string, number>} total total size
   * @param {Record<string, number>} size single size
   * @returns {void}
   */
  const addSizeTo = (total, size) => {
    for (const [type, amount] of Object.entries(size)) {
      const current = total[type] || 0;
      total[type] = current + amount;
    }
  };
  /**
   * Subtracts every entry of `size` from `total`, mutating `total` in place.
   * Each type in `size` is expected to be present in `total` already; a type
   * that is missing there ends up as `NaN`.
   *
   * @param {Record<string, number>} total total size
   * @param {Record<string, number>} size single size
   * @returns {void}
   */
  const subtractSizeFrom = (total, size) => {
    Object.keys(size).forEach(type => {
      total[type] -= size[type];
    });
  };
  /**
   * Accumulates the `size` records of all given nodes into one fresh,
   * prototype-less object.
   *
   * @template T
   * @param {Iterable<Node<T>>} nodes some nodes
   * @returns {Record<string, number>} total size
   */
  const sumSize = nodes => {
    const total = Object.create(null);
    for (const { size } of nodes) addSizeTo(total, size);
    return total;
  };
  /**
   * Checks whether any size type exceeds its configured maximum.
   * Types whose size is exactly 0 and types without a numeric entry in
   * `maxSize` are ignored.
   *
   * @param {Record<string, number>} size size
   * @param {Record<string, number>} maxSize maximum size
   * @returns {boolean} true, when size is too big
   */
  const isTooBig = (size, maxSize) =>
    Object.keys(size).some(type => {
      const value = size[type];
      if (value === 0) return false;
      const limit = maxSize[type];
      return typeof limit === "number" && value > limit;
    });
  /**
   * Checks whether any size type falls below its configured minimum.
   * Types whose size is exactly 0 and types without a numeric entry in
   * `minSize` are ignored.
   *
   * @param {Record<string, number>} size size
   * @param {Record<string, number>} minSize minimum size
   * @returns {boolean} true, when size is too small
   */
  const isTooSmall = (size, minSize) =>
    Object.keys(size).some(type => {
      const value = size[type];
      if (value === 0) return false;
      const threshold = minSize[type];
      return typeof threshold === "number" && value < threshold;
    });
  /**
   * Collects the size types that are below their configured minimum.
   * Types whose size is exactly 0 and types without a numeric entry in
   * `minSize` are never reported.
   *
   * @param {Record<string, number>} size size
   * @param {Record<string, number>} minSize minimum size
   * @returns {Set<string>} set of types that are too small
   */
  const getTooSmallTypes = (size, minSize) => {
    const undersized = Object.keys(size).filter(type => {
      const value = size[type];
      if (value === 0) return false;
      const threshold = minSize[type];
      return typeof threshold === "number" && value < threshold;
    });
    return new Set(undersized);
  };
  /**
   * Counts the size types that have a non-zero value in `size` and are
   * contained in `types`.
   *
   * @param {Record<string, number>} size size
   * @param {Set<string>} types types
   * @returns {number} number of matching size types
   */
  const getNumberOfMatchingSizeTypes = (size, types) =>
    Object.keys(size).filter(type => size[type] !== 0 && types.has(type))
      .length;
  /**
   * Adds up the sizes of only those types that are contained in `types`.
   *
   * @param {Record<string, number>} size size
   * @param {Set<string>} types types
   * @returns {number} selective size sum
   */
  const selectiveSizeSum = (size, types) =>
    Object.keys(size).reduce((sum, type) => {
      const value = size[type];
      return value !== 0 && types.has(type) ? sum + value : sum;
    }, 0);
  /**
   * Plain record that pairs an item with its grouping key and its size.
   *
   * @template T
   */
  class Node {
    /**
     * @param {T} item item
     * @param {string} key key
     * @param {Record<string, number>} size size
     */
    constructor(item, key, size) {
      /** @type {T} the wrapped item, stored as given */
      this.item = item;
      /** @type {string} key of the item */
      this.key = key;
      /** @type {Record<string, number>} size of the item per size type */
      this.size = size;
    }
  }
  /**
   * A run of nodes together with the similarities between neighbouring nodes
   * and the accumulated size of all nodes.
   *
   * @template T
   */
  class Group {
    /**
     * @param {Node<T>[]} nodes nodes
     * @param {number[] | null} similarities similarities between the nodes (length = nodes.length - 1)
     * @param {Record<string, number>=} size size of the group, summed up from the nodes when omitted
     */
    constructor(nodes, similarities, size) {
      this.nodes = nodes;
      this.similarities = similarities;
      this.size = size || sumSize(nodes);
      /** @type {string | undefined} */
      this.key = undefined;
    }

    /**
     * Removes all nodes accepted by `filter` from this group and rebuilds
     * `nodes`, `similarities` and `size` from the nodes that stay.
     * When the filter accepts every node the group is left untouched and
     * `undefined` is returned.
     *
     * @param {function(Node<T>): boolean} filter filter function
     * @returns {Node<T>[] | undefined} removed nodes
     */
    popNodes(filter) {
      const kept = [];
      const keptSimilarities = [];
      const removed = [];
      let previousKept;
      this.nodes.forEach((node, index) => {
        if (filter(node)) {
          removed.push(node);
          return;
        }
        if (kept.length > 0) {
          // the stored similarity is only valid when both nodes were
          // neighbours before; otherwise it has to be computed again
          const wereNeighbours = previousKept === this.nodes[index - 1];
          keptSimilarities.push(
            wereNeighbours
              ? /** @type {number[]} */ (this.similarities)[index - 1]
              : similarity(previousKept.key, node.key)
          );
        }
        kept.push(node);
        previousKept = node;
      });
      if (removed.length === this.nodes.length) return undefined;
      this.nodes = kept;
      this.similarities = keptSimilarities;
      this.size = sumSize(kept);
      return removed;
    }
  }
  /**
   * Computes the similarity of every node to its predecessor, in iteration
   * order. For `n` nodes this yields `n - 1` values (none for fewer than two
   * nodes).
   *
   * @template T
   * @param {Iterable<Node<T>>} nodes nodes
   * @returns {number[]} similarities
   */
  const getSimilarities = nodes => {
    /** @type {number[]} */
    const scores = [];
    let previous;
    for (const current of nodes) {
      // the first node has no predecessor to compare with
      if (previous !== undefined) {
        scores.push(similarity(previous.key, current.key));
      }
      previous = current;
    }
    return scores;
  };
  256. /**
  257. * @template T
  258. * @typedef {object} GroupedItems<T>
  259. * @property {string} key
  260. * @property {T[]} items
  261. * @property {Record<string, number>} size
  262. */
  263. /**
  264. * @template T
  265. * @typedef {object} Options
  266. * @property {Record<string, number>} maxSize maximum size of a group
  267. * @property {Record<string, number>} minSize minimum size of a group (preferred over maximum size)
  268. * @property {Iterable<T>} items a list of items
  269. * @property {function(T): Record<string, number>} getSize function to get size of an item
  270. * @property {function(T): string} getKey function to get the key of an item
  271. */
  272. /**
  273. * @template T
  274. * @param {Options<T>} options options object
  275. * @returns {GroupedItems<T>[]} grouped items
  276. */
  277. module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
  278. /** @type {Group<T>[]} */
  279. const result = [];
  280. const nodes = Array.from(
  281. items,
  282. item => new Node(item, getKey(item), getSize(item))
  283. );
  284. /** @type {Node<T>[]} */
  285. const initialNodes = [];
  286. // lexically ordering of keys
  287. nodes.sort((a, b) => {
  288. if (a.key < b.key) return -1;
  289. if (a.key > b.key) return 1;
  290. return 0;
  291. });
  292. // return nodes bigger than maxSize directly as group
  293. // But make sure that minSize is not violated
  294. for (const node of nodes) {
  295. if (isTooBig(node.size, maxSize) && !isTooSmall(node.size, minSize)) {
  296. result.push(new Group([node], []));
  297. } else {
  298. initialNodes.push(node);
  299. }
  300. }
  301. if (initialNodes.length > 0) {
  302. const initialGroup = new Group(initialNodes, getSimilarities(initialNodes));
  303. /**
  304. * @param {Group<T>} group group
  305. * @param {Record<string, number>} consideredSize size of the group to consider
  306. * @returns {boolean} true, if the group was modified
  307. */
  308. const removeProblematicNodes = (group, consideredSize = group.size) => {
  309. const problemTypes = getTooSmallTypes(consideredSize, minSize);
  310. if (problemTypes.size > 0) {
  311. // We hit an edge case where the working set is already smaller than minSize
  312. // We merge problematic nodes with the smallest result node to keep minSize intact
  313. const problemNodes = group.popNodes(
  314. n => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  315. );
  316. if (problemNodes === undefined) return false;
  317. // Only merge it with result nodes that have the problematic size type
  318. const possibleResultGroups = result.filter(
  319. n => getNumberOfMatchingSizeTypes(n.size, problemTypes) > 0
  320. );
  321. if (possibleResultGroups.length > 0) {
  322. const bestGroup = possibleResultGroups.reduce((min, group) => {
  323. const minMatches = getNumberOfMatchingSizeTypes(min, problemTypes);
  324. const groupMatches = getNumberOfMatchingSizeTypes(
  325. group,
  326. problemTypes
  327. );
  328. if (minMatches !== groupMatches)
  329. return minMatches < groupMatches ? group : min;
  330. if (
  331. selectiveSizeSum(min.size, problemTypes) >
  332. selectiveSizeSum(group.size, problemTypes)
  333. )
  334. return group;
  335. return min;
  336. });
  337. for (const node of problemNodes) bestGroup.nodes.push(node);
  338. bestGroup.nodes.sort((a, b) => {
  339. if (a.key < b.key) return -1;
  340. if (a.key > b.key) return 1;
  341. return 0;
  342. });
  343. } else {
  344. // There are no other nodes with the same size types
  345. // We create a new group and have to accept that it's smaller than minSize
  346. result.push(new Group(problemNodes, null));
  347. }
  348. return true;
  349. } else {
  350. return false;
  351. }
  352. };
  353. if (initialGroup.nodes.length > 0) {
  354. const queue = [initialGroup];
  355. while (queue.length) {
  356. const group = /** @type {Group<T>} */ (queue.pop());
  357. // only groups bigger than maxSize need to be splitted
  358. if (!isTooBig(group.size, maxSize)) {
  359. result.push(group);
  360. continue;
  361. }
  362. // If the group is already too small
  363. // we try to work only with the unproblematic nodes
  364. if (removeProblematicNodes(group)) {
  365. // This changed something, so we try this group again
  366. queue.push(group);
  367. continue;
  368. }
  369. // find unsplittable area from left and right
  370. // going minSize from left and right
  371. // at least one node need to be included otherwise we get stuck
  372. let left = 1;
  373. let leftSize = Object.create(null);
  374. addSizeTo(leftSize, group.nodes[0].size);
  375. while (left < group.nodes.length && isTooSmall(leftSize, minSize)) {
  376. addSizeTo(leftSize, group.nodes[left].size);
  377. left++;
  378. }
  379. let right = group.nodes.length - 2;
  380. let rightSize = Object.create(null);
  381. addSizeTo(rightSize, group.nodes[group.nodes.length - 1].size);
  382. while (right >= 0 && isTooSmall(rightSize, minSize)) {
  383. addSizeTo(rightSize, group.nodes[right].size);
  384. right--;
  385. }
  386. // left v v right
  387. // [ O O O ] O O O [ O O O ]
  388. // ^^^^^^^^^ leftSize
  389. // rightSize ^^^^^^^^^
  390. // leftSize > minSize
  391. // rightSize > minSize
  392. // Perfect split: [ O O O ] [ O O O ]
  393. // right === left - 1
  394. if (left - 1 > right) {
  395. // We try to remove some problematic nodes to "fix" that
  396. let prevSize;
  397. if (right < group.nodes.length - left) {
  398. subtractSizeFrom(rightSize, group.nodes[right + 1].size);
  399. prevSize = rightSize;
  400. } else {
  401. subtractSizeFrom(leftSize, group.nodes[left - 1].size);
  402. prevSize = leftSize;
  403. }
  404. if (removeProblematicNodes(group, prevSize)) {
  405. // This changed something, so we try this group again
  406. queue.push(group);
  407. continue;
  408. }
  409. // can't split group while holding minSize
  410. // because minSize is preferred of maxSize we return
  411. // the problematic nodes as result here even while it's too big
  412. // To avoid this make sure maxSize > minSize * 3
  413. result.push(group);
  414. continue;
  415. }
  416. if (left <= right) {
  417. // when there is a area between left and right
  418. // we look for best split point
  419. // we split at the minimum similarity
  420. // here key space is separated the most
  421. // But we also need to make sure to not create too small groups
  422. let best = -1;
  423. let bestSimilarity = Infinity;
  424. let pos = left;
  425. let rightSize = sumSize(group.nodes.slice(pos));
  426. // pos v v right
  427. // [ O O O ] O O O [ O O O ]
  428. // ^^^^^^^^^ leftSize
  429. // rightSize ^^^^^^^^^^^^^^^
  430. while (pos <= right + 1) {
  431. const similarity = /** @type {number[]} */ (group.similarities)[
  432. pos - 1
  433. ];
  434. if (
  435. similarity < bestSimilarity &&
  436. !isTooSmall(leftSize, minSize) &&
  437. !isTooSmall(rightSize, minSize)
  438. ) {
  439. best = pos;
  440. bestSimilarity = similarity;
  441. }
  442. addSizeTo(leftSize, group.nodes[pos].size);
  443. subtractSizeFrom(rightSize, group.nodes[pos].size);
  444. pos++;
  445. }
  446. if (best < 0) {
  447. // This can't happen
  448. // but if that assumption is wrong
  449. // fallback to a big group
  450. result.push(group);
  451. continue;
  452. }
  453. left = best;
  454. right = best - 1;
  455. }
  456. // create two new groups for left and right area
  457. // and queue them up
  458. const rightNodes = [group.nodes[right + 1]];
  459. /** @type {number[]} */
  460. const rightSimilarities = [];
  461. for (let i = right + 2; i < group.nodes.length; i++) {
  462. rightSimilarities.push(
  463. /** @type {number[]} */ (group.similarities)[i - 1]
  464. );
  465. rightNodes.push(group.nodes[i]);
  466. }
  467. queue.push(new Group(rightNodes, rightSimilarities));
  468. const leftNodes = [group.nodes[0]];
  469. /** @type {number[]} */
  470. const leftSimilarities = [];
  471. for (let i = 1; i < left; i++) {
  472. leftSimilarities.push(
  473. /** @type {number[]} */ (group.similarities)[i - 1]
  474. );
  475. leftNodes.push(group.nodes[i]);
  476. }
  477. queue.push(new Group(leftNodes, leftSimilarities));
  478. }
  479. }
  480. }
  481. // lexically ordering
  482. result.sort((a, b) => {
  483. if (a.nodes[0].key < b.nodes[0].key) return -1;
  484. if (a.nodes[0].key > b.nodes[0].key) return 1;
  485. return 0;
  486. });
  487. // give every group a name
  488. const usedNames = new Set();
  489. for (let i = 0; i < result.length; i++) {
  490. const group = result[i];
  491. if (group.nodes.length === 1) {
  492. group.key = group.nodes[0].key;
  493. } else {
  494. const first = group.nodes[0];
  495. const last = group.nodes[group.nodes.length - 1];
  496. const name = getName(first.key, last.key, usedNames);
  497. group.key = name;
  498. }
  499. }
  500. // return the results
  501. return result.map(group => {
  502. /** @type {GroupedItems<T>} */
  503. return {
  504. key: group.key,
  505. items: group.nodes.map(node => node.item),
  506. size: group.size
  507. };
  508. });
  509. };