DocParse.cs 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. using System;
  2. using System.IO;
  3. using System.Linq;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. using BitMiracle.Docotic.Pdf;
  7. using Console = HyperCube.Utils.AdvConsole;
  8. namespace HyperCube
  9. {
  /// <summary>
  /// Extracts article metadata (name, authors, keywords) and the raw text
  /// from uploaded documents.
  /// </summary>
  public class DocParse
  {
      // Regex instances are immutable and safe to share, so each pattern is
      // constructed once instead of on every ReadPDF call.

      // First sentence of the text: everything from the very start of the string
      // up to the first '.', '!' or '?' that is followed by whitespace or the end.
      // No Multiline/Singleline option is set, so '^' only anchors at index 0 and
      // '.' does not cross a line break.
      private static readonly Regex s_nameRegex = new(@"^.*?[\.!\?](?:\s|$)");

      // Two Cyrillic initials followed by a capitalised Cyrillic surname,
      // e.g. "И. О. Фамилия" or "И.О. Фамилия".
      private static readonly Regex s_authorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

      // The "ключевые слова:" label (case-insensitive) followed by a list of
      // words, commas, hyphens and whitespace that ends with a period.
      private static readonly Regex s_keywordsRegex =
          new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

      private static readonly Regex s_whitespaceRunRegex = new(@"\s+");

      /// <summary>
      /// Reads a PDF from <paramref name="ms"/> and fills an article model with
      /// the extracted text plus whatever metadata the heuristics can find.
      /// </summary>
      /// <param name="ms">
      /// Stream holding the PDF. It is rewound to position 0 before reading.
      /// </param>
      /// <returns>
      /// A new model whose <c>Text</c> is always set; <c>Name</c>, <c>Authors</c>
      /// and <c>Keywords</c> are set only when the corresponding pattern matches
      /// (a message is logged otherwise).
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="ms"/> is null.</exception>
      public static Models.ArticleModel ReadPDF(MemoryStream ms)
      {
          if (ms is null)
          {
              throw new ArgumentNullException(nameof(ms));
          }

          Models.ArticleModel articleModel = new();

          ms.Position = 0;

          // The document is only needed long enough to pull the text out, so it is
          // disposed right away instead of being left for the finalizer.
          // NOTE(review): assumes disposing the document does not close the
          // caller's stream - confirm against the Docotic.Pdf documentation.
          string text;
          using (PdfDocument pdf = new(ms))
          {
              text = pdf.GetText();
          }

          // Article name: the pattern is anchored at the start of the text, so it
          // can match at most once and a single Match is sufficient.
          Match nameMatch = s_nameRegex.Match(text);
          if (nameMatch.Success)
          {
              articleModel.Name = nameMatch.Value.Trim();
          }
          else
          {
              Console.WriteLine("cant get name");
          }

          // Publish date: not extracted yet.

          // Authors: every "initials + surname" occurrence, comma-separated.
          MatchCollection authorMatches = s_authorsRegex.Matches(text);
          if (authorMatches.Count > 0)
          {
              articleModel.Authors = JoinMatches(authorMatches);
          }
          else
          {
              Console.WriteLine("authors not found");
          }

          // Keywords.
          MatchCollection keywordMatches = s_keywordsRegex.Matches(text);
          if (keywordMatches.Count > 0)
          {
              articleModel.Keywords = ExtractKeywords(keywordMatches);
          }
          else
          {
              Console.WriteLine("keywords not found");
          }

          // Annotation: not extracted yet.

          articleModel.Text = text;
          return articleModel;
      }

      /// <summary>
      /// Placeholder for DOCX parsing; currently builds nothing and returns an
      /// empty string.
      /// </summary>
      /// <returns>An empty string.</returns>
      public static string ReadDocx()
      {
          StringBuilder pageText = new();
          return pageText.ToString();
      }

      /// <summary>Joins the matched values with ", ".</summary>
      private static string JoinMatches(MatchCollection matches)
      {
          return string.Join(", ", matches.Cast<Match>().Select(m => m.Value));
      }

      /// <summary>
      /// Turns the raw keyword matches into a single-line keyword list without
      /// the leading label.
      /// </summary>
      private static string ExtractKeywords(MatchCollection matches)
      {
          string keywords = JoinMatches(matches);

          // Collapse line breaks and other whitespace runs into single spaces,
          // then drop every hyphen.
          keywords = s_whitespaceRunRegex.Replace(keywords, " ");
          keywords = keywords.Replace("-", "");

          // Skip the label: everything up to the first ':' plus the space after it.
          // The pattern guarantees a ':' followed by whitespace and a closing '.',
          // so the start index is always inside the string.
          // NOTE(review): when the pattern matches more than once, only the first
          // label is removed; later matches keep theirs.
          return keywords.Substring(keywords.IndexOf(':') + 2);
      }
  }
  71. }