DocParse.cs 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. using BitMiracle.Docotic.Pdf;
  2. using Microsoft.AspNetCore.Components.Forms;
  3. using System;
  4. using System.Collections.Generic;
  5. using System.IO;
  6. using System.Linq;
  7. using System.Text;
  8. using System.Text.RegularExpressions;
  9. using System.Threading.Tasks;
  10. namespace HyperCube
  11. {
  /// <summary>
  /// Pulls bibliographic fields (title, authors, keywords) and the raw text
  /// out of uploaded documents.
  /// </summary>
  public class DocParse
  {
      /// <summary>Upper bound, in bytes, passed to <c>OpenReadStream</c> for an upload.</summary>
      private const long MaxFileSize = 5120000;

      // The patterns below are unchanged from the original inline regexes; they are
      // cached in static fields so they are not rebuilt on every ReadPDF call.

      /// <summary>Text from the start of the document up to the first '.', '!' or '?' followed by whitespace or end of input.</summary>
      private static readonly Regex NameRegex = new(@"^.*?[\.!\?](?:\s|$)");

      /// <summary>Cyrillic "I. O. Surname"-style author entries (two initials, then a capitalized surname).</summary>
      private static readonly Regex AuthorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

      /// <summary>A "ключевые слова:" label followed by the keyword list up to a terminating '.'.</summary>
      private static readonly Regex KeywordsRegex =
          new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

      /// <summary>Any run of whitespace; used to flatten line breaks inside the keyword list.</summary>
      private static readonly Regex WhitespaceRunRegex = new(@"\s+");

      /// <summary>
      /// Reads an uploaded PDF and extracts the fields that can be recognized in its text.
      /// </summary>
      /// <param name="file">Uploaded file; at most <see cref="MaxFileSize"/> bytes are accepted.</param>
      /// <returns>
      /// A dictionary with the keys "name", "date", "authors", "keywords", "annotation" and "text".
      /// "date" and "annotation" are not extracted yet and stay empty; any other field that
      /// cannot be found also stays an empty string. "text" holds the full extracted text.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
      public static async Task<Dictionary<string, string>> ReadPDF(IBrowserFile file)
      {
          if (file is null)
          {
              throw new ArgumentNullException(nameof(file));
          }

          Console.WriteLine("ReadPDF start");

          Dictionary<string, string> docFields = new()
          {
              { "name", "" },
              { "date", "" },
              { "authors", "" },
              { "keywords", "" },
              { "annotation", "" },
              { "text", "" }
          };

          string text = await ExtractPdfTextAsync(file);

          // Article name: the first sentence of the document.
          if (TryJoinMatches(NameRegex, text, out string name))
          {
              docFields["name"] = name.Trim();
          }
          else
          {
              Console.WriteLine("name not found");
          }

          // TODO: publish date is not extracted yet.

          // Authors.
          if (TryJoinMatches(AuthorsRegex, text, out string authors))
          {
              docFields["authors"] = authors;
          }
          else
          {
              Console.WriteLine("authors not found");
          }

          // Keywords.
          if (TryJoinMatches(KeywordsRegex, text, out string keywords))
          {
              docFields["keywords"] = CleanKeywords(keywords);
          }
          else
          {
              Console.WriteLine("keywords not found");
          }

          // TODO: annotation is not extracted yet.

          // Temporary: the whole text is returned alongside the parsed fields.
          docFields["text"] = text;

          Console.WriteLine("ReadPDF end");
          return docFields;
      }

      /// <summary>
      /// Placeholder for DOCX reading; not implemented yet.
      /// </summary>
      /// <returns>Currently always an empty string.</returns>
      public string ReadDocx()
      {
          StringBuilder pageText = new();
          return pageText.ToString();
      }

      /// <summary>Buffers the upload in memory and returns the text of the PDF it contains.</summary>
      private static async Task<string> ExtractPdfTextAsync(IBrowserFile file)
      {
          using MemoryStream buffer = new();

          // The upload is copied asynchronously into a seekable in-memory buffer before
          // the PDF library touches it; the upload stream is released as soon as the copy ends.
          await using (Stream upload = file.OpenReadStream(MaxFileSize))
          {
              await upload.CopyToAsync(buffer);
          }

          buffer.Position = 0;
          Console.WriteLine($"stream length: {buffer.Length}, canread: {buffer.CanRead}, canseek:{buffer.CanSeek}");

          // Declared after the buffer so it is disposed first, while the buffer is still open.
          using PdfDocument pdf = new(buffer);
          return pdf.GetText(); // alternative: GetTextWithFormatting()
      }

      /// <summary>
      /// Joins every match of <paramref name="pattern"/> in <paramref name="text"/> with ", ".
      /// </summary>
      /// <returns>True when at least one match was found; otherwise <paramref name="joined"/> is empty.</returns>
      private static bool TryJoinMatches(Regex pattern, string text, out string joined)
      {
          MatchCollection matches = pattern.Matches(text);
          joined = string.Join(", ", matches.Cast<Match>().Select(match => match.Value));
          return matches.Count > 0;
      }

      /// <summary>
      /// Normalizes the matched keyword section: collapses whitespace, removes hyphens
      /// and cuts off everything up to and including the first ": ".
      /// </summary>
      /// <remarks>
      /// Only the first label is stripped; if several keyword sections matched, the labels
      /// of the later ones remain in the result.
      /// </remarks>
      private static string CleanKeywords(string rawMatch)
      {
          string cleaned = WhitespaceRunRegex.Replace(rawMatch, " ");
          cleaned = cleaned.Replace("-", "");

          // Skip the colon and the single space that follows it after whitespace collapsing.
          // The char overload of IndexOf is ordinal, unlike the culture-sensitive string overload.
          return cleaned.Substring(cleaned.IndexOf(':') + 2);
      }

      /// <summary>
      /// Placeholder for attribute parsing of plain text; not implemented yet.
      /// </summary>
      /// <param name="txtdata">Text to parse (currently unused).</param>
      /// <returns>Currently always an empty dictionary.</returns>
      private Dictionary<string, string> ParseTextData(string txtdata)
      {
          Dictionary<string, string> docAtributes = new();
          return docAtributes;
      }
  }
  91. }