dupindex.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. /*
  2. * Software Index, Copyright 2010, Software Index Project Team
  3. * Link: http://swi.sourceforge.net
  4. *
  5. * This file is part of Software Index Tool.
  6. *
  7. * Software Index is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation, version 3 of the License.
  10. *
  11. * Software Index is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with Software Index. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <algorithm>
  20. #include <string>
  21. #include <vector>
  22. static const char SWI_EOF = '\x7'; // ASCII code 7 is used as EOF marker
  23. static const int SWI_MAX_TEXT_LENGTH = 10000;
  24. /**
  25. * Container for sortable stuff
  26. * Strings are not moved, just bookmarks are sorted
  27. */
  28. struct SwiBookmark
  29. {
  30. int m_possition;
  31. const char* m_text;
  32. // sort algorithm requires to define this operator
  33. bool operator<(const SwiBookmark& another) const
  34. {
  35. return strcmp(another.m_text, m_text) < 0;
  36. }
  37. };
  38. /**
  39. * Container for the file/function record
  40. */
  41. struct SwiRecord
  42. {
  43. const char* functionName;
  44. const char* fileName;
  45. int end;
  46. };
  47. /**
  48. * Returns line number in intial record
  49. * from the begging till the symbol by offset
  50. */
  51. static int swiLineNumberGet(const char* text,
  52. int offset,
  53. SwiRecord files[],
  54. int fileIndex)
  55. {
  56. const int start = (fileIndex == 0) ? 0 : files[fileIndex - 1].end;
  57. int line = 1;
  58. for (int i = start; i < offset; ++i)
  59. {
  60. if (text[i] == '\n')
  61. {
  62. ++line;
  63. }
  64. }
  65. return line;
  66. }
  67. /**
  68. * Adds a character to the processed string,
  69. * initializes new bookmark, and returns it
  70. */
  71. static SwiBookmark swiProcessedCharacterAdd(char c, int originalPossition, char* processedText)
  72. {
  73. static int processedPossition;
  74. SwiBookmark bookmark;
  75. bookmark.m_possition = originalPossition;
  76. bookmark.m_text = processedText + processedPossition;
  77. processedText[processedPossition++] = c;
  78. return bookmark;
  79. }
  80. /**
  81. * Prints to stdout the found items
  82. * The format of stdout is strictly defined by the caller: SWI/PROCESSOR
  83. * Change it synchroniously with SWI/PROCESSOR code
  84. */
  85. static void swiResultPrint(SwiBookmark bookmark,
  86. SwiRecord files[],
  87. const char* originalText,
  88. int numOfDuplicatedSymbols)
  89. {
  90. int recordIndex = 0;
  91. while (files[recordIndex].end <= bookmark.m_possition)
  92. {
  93. ++recordIndex;
  94. }
  95. printf("duplication: file: '%s' function: '%s' possition: '%d' size: '%d' \n",
  96. files[recordIndex].fileName,
  97. files[recordIndex].functionName,
  98. swiLineNumberGet(originalText, bookmark.m_possition, files, recordIndex),
  99. numOfDuplicatedSymbols);
  100. }
  101. /**
  102. * Returns number of symbols which are equal from the beggining
  103. */
  104. static int swiStringsCompare(SwiBookmark bookmarkFirst,
  105. SwiBookmark bookmarkSecond)
  106. {
  107. int pos = 0;
  108. for (; bookmarkFirst.m_text[pos] == bookmarkSecond.m_text[pos]; ++pos)
  109. {
  110. if (bookmarkFirst.m_text[pos] == SWI_EOF)
  111. {
  112. break;
  113. }
  114. }
  115. return pos;
  116. }
  117. /**
  118. * Checks if two strings are equal in the defined range of symbol's indexes:
  119. * from the first symbol till upperLimit
  120. */
  121. static bool swiStringsEqualCheck(SwiBookmark bookmarkFirst,
  122. SwiBookmark bookmarkSecond,
  123. int upperLimit,
  124. const char* endPos)
  125. {
  126. if (bookmarkFirst.m_text + upperLimit >= endPos ||
  127. bookmarkSecond.m_text + upperLimit >= endPos)
  128. {
  129. return false;
  130. }
  131. for (int i = upperLimit; i >= 0; --i)
  132. {
  133. if (bookmarkFirst.m_text[i] != bookmarkSecond.m_text[i])
  134. {
  135. return false;
  136. }
  137. }
  138. return true;
  139. }
  140. /**
  141. * Scans the original text into a processed text, which is returned.
  142. * The processed string does not include the ignorable symbols
  143. * The list of ignorable symbols is configured (usualy spaces)
  144. * Bookmarks are installed for every newline
  145. * if it begins from the 'regular' symbol
  146. * The list of non-regular symbols is configred
  147. */
  148. static const char* swiOriginalStringProcess(std::string originalText,
  149. std::vector<SwiBookmark>& bookmarks,
  150. const char * ignorableSymbols,
  151. const char * nonregularSymbols)
  152. {
  153. char* const processedText = new char[originalText.length()];
  154. bool isNewlineDetected = true;
  155. for (unsigned int i = 0; i < originalText.length(); ++i)
  156. {
  157. const char c = originalText[i];
  158. bool isIgnorable = false;
  159. bool isRegular = true;
  160. // Processed text should not include the end-null symbol
  161. if (c == '\0')
  162. {
  163. continue;
  164. }
  165. if (c == SWI_EOF)
  166. {
  167. swiProcessedCharacterAdd(c, i, processedText);
  168. continue;
  169. }
  170. for (int j = 0; ignorableSymbols[j] != '\0'; ++j)
  171. {
  172. if (ignorableSymbols[j] == c)
  173. {
  174. isIgnorable = true;
  175. }
  176. }
  177. for (int j = 0; nonregularSymbols[j] != '\0'; ++j)
  178. {
  179. if (nonregularSymbols[j] == c)
  180. {
  181. isRegular = false;
  182. }
  183. }
  184. if (!isIgnorable)
  185. {
  186. if (isNewlineDetected && isRegular)
  187. {
  188. bookmarks.push_back(swiProcessedCharacterAdd(c, i, processedText));
  189. }
  190. else
  191. {
  192. swiProcessedCharacterAdd(c, i, processedText);
  193. }
  194. isNewlineDetected = false;
  195. }
  196. if (c == '\n')
  197. {
  198. isNewlineDetected = true;
  199. }
  200. }
  201. swiProcessedCharacterAdd('\0', originalText.length(), processedText);
  202. return processedText;
  203. }
  204. /**
  205. * Removes bookmarks which are in reported area already
  206. * It helps to prevent the unexpected reporting of the duplicatedSymbolsCount markers more than once
  207. */
  208. static void swiBookmarksShift(std::vector<SwiBookmark>& bookmarks)
  209. {
  210. unsigned int indexFrom = 0;
  211. for (unsigned int indexTo = 0; indexTo < bookmarks.size() - 1; ++indexTo)
  212. {
  213. if (bookmarks[indexTo].m_text == 0)
  214. {
  215. if (indexFrom <= indexTo)
  216. {
  217. indexFrom = indexTo + 1;
  218. }
  219. while (indexFrom < bookmarks.size() &&
  220. bookmarks[indexFrom].m_text == 0)
  221. {
  222. ++indexFrom;
  223. }
  224. if (indexFrom == bookmarks.size())
  225. {
  226. break;
  227. }
  228. bookmarks[indexTo]= bookmarks[indexFrom];
  229. bookmarks[indexFrom].m_text = 0;
  230. }
  231. }
  232. }
  233. /**
  234. * Programm entry point
  235. */
  236. int main(int argc, char* argv[])
  237. {
  238. std::vector<SwiRecord> files(0);
  239. std::string commonString = "";
  240. int minLength = 100;
  241. int proximityFactor = 100;
  242. char * ignorabaleSymbols = new char[SWI_MAX_TEXT_LENGTH];
  243. char * nonRegularSymbols = new char[SWI_MAX_TEXT_LENGTH];
  244. strcpy(ignorabaleSymbols, " \t\n");
  245. strcpy(nonRegularSymbols, "}");
  246. while (1)
  247. {
  248. char * command = new char[SWI_MAX_TEXT_LENGTH];
  249. scanf("%s", command);
  250. if (strcmp(command, "start") == 0)
  251. {
  252. // Continue search algorithm
  253. break;
  254. }
  255. if (strcmp(command, "exit") == 0)
  256. {
  257. // Breaking the search
  258. exit(EXIT_SUCCESS);
  259. }
  260. if (strcmp(command, "init_length") == 0)
  261. {
  262. scanf("%d", &minLength);
  263. }
  264. if (strcmp(command, "init_proximity") == 0)
  265. {
  266. scanf("%d", &proximityFactor);
  267. if (proximityFactor < 1 || proximityFactor > 100)
  268. {
  269. fprintf(stderr, "error: Proximity factor must be between 1 and 100 (inclusive)\n");
  270. exit(EXIT_FAILURE);
  271. }
  272. }
  273. if (strcmp(command, "init_ignorable") == 0)
  274. {
  275. // TODO: no ways to configure newline symbol
  276. scanf("%s", ignorabaleSymbols);
  277. }
  278. if (strcmp(command, "init_nonregular") == 0)
  279. {
  280. // TODO: no ways to configure hewline symbol
  281. scanf("%s", nonRegularSymbols);
  282. }
  283. if (strcmp(command, "init_file") == 0)
  284. {
  285. unsigned int fileLength = 0;
  286. unsigned int functionLength = 0;
  287. unsigned int textLength = 0;
  288. scanf("%d", &fileLength);
  289. scanf("%d", &functionLength);
  290. scanf("%d", &textLength);
  291. char * file = new char[fileLength + 1];
  292. char * function = new char[functionLength + 1];
  293. char * text = new char[textLength + 1 + 1]; // +1 for null pointer, +1 for EOF symbol
  294. file[fileLength] = '\0';
  295. function[functionLength] = '\0';
  296. text[textLength] = SWI_EOF;
  297. text[textLength + 1] = '\0';
  298. fread(file, 1, 1, stdin); // Consume one empty symbol
  299. fread(file, fileLength, 1, stdin);
  300. fread(function, functionLength, 1, stdin);
  301. fread(text, textLength, 1, stdin);
  302. commonString += std::string(text);
  303. SwiRecord record;
  304. record.end = commonString.length();
  305. record.functionName = function;
  306. record.fileName = file;
  307. files.push_back(record);
  308. delete text;
  309. }
  310. delete command;
  311. }
  312. if (commonString.length() == 0)
  313. {
  314. fprintf(stderr, "error: No files defined or they are empty\n");
  315. exit(EXIT_FAILURE);
  316. }
  317. std::vector<SwiBookmark> bookmarks;
  318. const char* processedText =
  319. swiOriginalStringProcess(commonString, bookmarks, ignorabaleSymbols, nonRegularSymbols);
  320. const char* processedEnd = processedText + strlen(processedText);
  321. std::sort(bookmarks.begin(), bookmarks.end());
  322. while(1)
  323. {
  324. int longestDuplication = 0;
  325. int firstInstanceIndex = 0;
  326. // Find the two bookmarks that have the longest common substring.
  327. for (unsigned int bookmarkIndex = 0; bookmarkIndex < bookmarks.size() - 1; ++bookmarkIndex)
  328. {
  329. if (bookmarks[bookmarkIndex + 1].m_text == 0)
  330. {
  331. break;
  332. }
  333. if (swiStringsEqualCheck(bookmarks[bookmarkIndex], bookmarks[bookmarkIndex + 1],
  334. longestDuplication, processedEnd))
  335. {
  336. const int duplicatedSymbolsCount = swiStringsCompare(bookmarks[bookmarkIndex],
  337. bookmarks[bookmarkIndex + 1]);
  338. if (duplicatedSymbolsCount > longestDuplication)
  339. {
  340. firstInstanceIndex = bookmarkIndex;
  341. longestDuplication = duplicatedSymbolsCount;
  342. }
  343. }
  344. }
  345. if (longestDuplication < minLength)
  346. {
  347. // Do not process too short strings
  348. // This is the exit from the while loop
  349. break;
  350. }
  351. int numberOfInstances = 2;
  352. int almostLongestDuplication = (longestDuplication * proximityFactor) / 100;
  353. int initialIndexForScan = firstInstanceIndex;
  354. // // Search for duplicated strings before the current pair.
  355. for (int i = initialIndexForScan - 1; i >= 0; --i)
  356. {
  357. const int duplicatedSymbolsCount =
  358. swiStringsCompare(bookmarks[initialIndexForScan], bookmarks[i]);
  359. if (duplicatedSymbolsCount < almostLongestDuplication)
  360. {
  361. break;
  362. }
  363. numberOfInstances++;
  364. if (longestDuplication > duplicatedSymbolsCount)
  365. {
  366. longestDuplication = duplicatedSymbolsCount;
  367. }
  368. firstInstanceIndex = i;
  369. }
  370. // Search for duplicated strings after the current pair.
  371. for (unsigned int i = initialIndexForScan + 2; i < bookmarks.size(); ++i)
  372. {
  373. if (bookmarks[i].m_text == 0)
  374. {
  375. break;
  376. }
  377. const int duplicatedSymbolsCount = swiStringsCompare(bookmarks[initialIndexForScan],
  378. bookmarks[i]);
  379. if (duplicatedSymbolsCount < almostLongestDuplication)
  380. {
  381. break;
  382. }
  383. numberOfInstances++;
  384. if (longestDuplication > duplicatedSymbolsCount)
  385. {
  386. longestDuplication = duplicatedSymbolsCount;
  387. }
  388. }
  389. printf("info: group_start\n");
  390. for (int i = 0; i < numberOfInstances; ++i)
  391. {
  392. swiResultPrint(bookmarks[firstInstanceIndex + i], &files[0],
  393. commonString.c_str(), longestDuplication);
  394. // Clear bookmarks that point out to the reported area.
  395. const char* reportStart =
  396. bookmarks[firstInstanceIndex + i].m_text;
  397. for (unsigned int j = 0; j < bookmarks.size() - 1; ++j)
  398. {
  399. if (bookmarks[j].m_text >= reportStart &&
  400. bookmarks[j].m_text < reportStart + longestDuplication)
  401. {
  402. bookmarks[j].m_text = 0;
  403. }
  404. }
  405. }
  406. printf("\n");
  407. swiBookmarksShift(bookmarks);
  408. }
  409. delete [] processedText;
  410. return 0;
  411. }