DocParse.cs 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. using System;
  2. using System.IO;
  3. using System.Linq;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. using BitMiracle.Docotic.Pdf;
  7. namespace HyperCube
  8. {
  /// <summary>
  /// Extracts article metadata (name, authors, keywords) and the raw text
  /// from uploaded documents.
  /// </summary>
  public class DocParse
  {
      // The patterns never change, so the Regex objects are built once and shared
      // instead of being re-created on every ReadPDF call.

      // From the start of the text up to the first '.', '!' or '?' that is followed
      // by whitespace or the end of the text. '.' does not match a newline, so the
      // terminator has to be on the first line for this to match.
      private static readonly Regex NameRegex =
          new(@"^.*?[\.!\?](?:\s|$)", RegexOptions.Compiled);

      // Two uppercase Cyrillic initials followed by a capitalized surname,
      // e.g. "И. И. Иванов" or "И.И. Иванов".
      private static readonly Regex AuthorsRegex =
          new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}", RegexOptions.Compiled);

      // The "ключевые слова:" label (case-insensitive) and everything up to the
      // period that closes the keyword list.
      private static readonly Regex KeywordsRegex =
          new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase | RegexOptions.Compiled);

      // Any run of whitespace, including line breaks left over from the PDF layout.
      private static readonly Regex WhitespaceRunRegex =
          new(@"\s+", RegexOptions.Compiled);

      /// <summary>
      /// Reads a PDF from <paramref name="ms"/> and fills an article model with the
      /// extracted text plus the name, authors and keywords found in that text.
      /// </summary>
      /// <param name="ms">Stream holding the PDF; it is rewound to position 0 before reading.</param>
      /// <returns>
      /// A new model whose <c>Text</c> is always set. <c>Name</c>, <c>Authors</c> and
      /// <c>Keywords</c> are only assigned when the corresponding pattern matches;
      /// otherwise a diagnostic line is written to the console.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="ms"/> is <c>null</c>.</exception>
      public static Models.ArticleModel ReadPDF(MemoryStream ms)
      {
          if (ms is null)
          {
              throw new ArgumentNullException(nameof(ms));
          }

          Models.ArticleModel articleModel = new();

          // The caller may hand over a stream that was just written to.
          ms.Position = 0;

          string text;
          // PdfDocument is disposable; release it as soon as the text is extracted.
          // NOTE(review): assumes Dispose() leaves the caller-supplied stream open - confirm.
          using (PdfDocument pdf = new(ms))
          {
              text = pdf.GetText();
          }

          // Article name.
          MatchCollection nameMatches = NameRegex.Matches(text);
          if (nameMatches.Count > 0)
          {
              articleModel.Name = JoinMatches(nameMatches).Trim();
          }
          else
          {
              Console.WriteLine("cant get name");
          }

          // Publish date: not extracted yet.

          // Authors.
          MatchCollection authorMatches = AuthorsRegex.Matches(text);
          if (authorMatches.Count > 0)
          {
              articleModel.Authors = JoinMatches(authorMatches);
          }
          else
          {
              Console.WriteLine("authors not found");
          }

          // Keywords. Each match is cleaned on its own so that the label is removed
          // from every match, not only from the first one in the joined string.
          MatchCollection keywordMatches = KeywordsRegex.Matches(text);
          if (keywordMatches.Count > 0)
          {
              articleModel.Keywords = string.Join(
                  ", ",
                  keywordMatches.Cast<Match>().Select(match => CleanKeywords(match.Value)));
          }
          else
          {
              Console.WriteLine("keywords not found");
          }

          // Annotation: not extracted yet.

          articleModel.Text = text;
          return articleModel;
      }

      /// <summary>
      /// Placeholder for DOCX parsing; no document is read yet.
      /// </summary>
      /// <returns>Always an empty string.</returns>
      public static string ReadDocx()
      {
          return string.Empty;
      }

      /// <summary>Joins the matched values with ", " in match order.</summary>
      private static string JoinMatches(MatchCollection matches)
      {
          return string.Join(", ", matches.Cast<Match>().Select(match => match.Value));
      }

      /// <summary>
      /// Turns one raw keywords match into the bare keyword list: whitespace runs are
      /// collapsed to a single space, hyphens are removed and the leading label is cut off.
      /// </summary>
      private static string CleanKeywords(string rawMatch)
      {
          string cleaned = WhitespaceRunRegex.Replace(rawMatch, " ");
          cleaned = cleaned.Replace("-", "");

          // The pattern guarantees "слова:" followed by whitespace (now a single space)
          // and a closing period, so skipping the colon and the space stays in range.
          return cleaned.Substring(cleaned.IndexOf(':') + 2);
      }
  }
  70. }