// DocParse.cs (3.5 KB)
  1. using BitMiracle.Docotic.Pdf;
  2. using Microsoft.AspNetCore.Components.Forms;
  3. using System;
  4. using System.Collections.Generic;
  5. using System.IO;
  6. using System.Linq;
  7. using System.Text;
  8. using System.Text.RegularExpressions;
  9. using System.Threading.Tasks;
  10. namespace HyperCube
  11. {
  /// <summary>
  /// Extracts bibliographic fields (authors, keywords, full text) from uploaded documents.
  /// Only PDF extraction is implemented; the DOCX and raw-text entry points are placeholders.
  /// </summary>
  public class DocParse
  {
      /// <summary>Upper bound, in bytes, passed to <c>OpenReadStream</c> when reading an upload.</summary>
      private const long MAX_FILE_SIZE = 5120000;

      /// <summary>
      /// Matches two dotted uppercase Cyrillic initials followed by a capitalised surname,
      /// e.g. "И. И. Иванов" or "И.И. Иванов".
      /// </summary>
      private static readonly Regex AuthorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

      /// <summary>
      /// Matches the "Ключевые слова:" label (case-insensitive) and the keyword list after it,
      /// up to and including the first full stop.
      /// </summary>
      private static readonly Regex KeywordsRegex =
          new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

      /// <summary>A hyphen followed by whitespace: a word that was wrapped onto the next line.</summary>
      private static readonly Regex WrappedHyphenRegex = new(@"-\s+");

      /// <summary>Any run of whitespace, including line breaks.</summary>
      private static readonly Regex WhitespaceRunRegex = new(@"\s+");

      /// <summary>
      /// Reads an uploaded PDF and extracts the fields that can be recognised in its text.
      /// </summary>
      /// <param name="file">The browser-uploaded PDF file.</param>
      /// <returns>
      /// A dictionary with the keys "name", "date", "authors", "keywords", "annotation" and "text".
      /// "text" holds the full extracted text; "authors" and "keywords" are filled when the
      /// corresponding pattern is found and stay empty otherwise. "name", "date" and
      /// "annotation" are always empty because their extraction is not implemented yet.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
      public static async Task<Dictionary<string, string>> ReadPDF(IBrowserFile file)
      {
          if (file is null)
          {
              throw new ArgumentNullException(nameof(file));
          }

          Console.WriteLine("ReadPDF start");

          Dictionary<string, string> docFields = new()
          {
              { "name", "" },
              { "date", "" },
              { "authors", "" },
              { "keywords", "" },
              { "annotation", "" },
              { "text", "" }
          };

          // The upload is buffered into memory before parsing.
          // NOTE(review): presumably because the PDF reader needs a synchronous, seekable
          // stream while the browser stream only supports async reads - confirm.
          await using Stream stream = file.OpenReadStream(MAX_FILE_SIZE);
          using MemoryStream ms = new();
          await stream.CopyToAsync(ms);
          ms.Position = 0;
          Console.WriteLine($"stream length: {ms.Length}, canread: {ms.CanRead}, canseek:{ms.CanSeek}");

          // Declared after the streams so it is disposed first, while its backing stream is still open.
          using PdfDocument pdf = new(ms);
          string text = pdf.GetText();

          // TODO: article name and publish date extraction are not implemented.

          MatchCollection authorMatches = AuthorsRegex.Matches(text);
          if (authorMatches.Count > 0)
          {
              docFields["authors"] = string.Join(", ", authorMatches.Select(m => m.Value));
          }
          else
          {
              Console.WriteLine("authors not found");
          }

          MatchCollection keywordMatches = KeywordsRegex.Matches(text);
          if (keywordMatches.Count > 0)
          {
              // Each match is cleaned on its own so the "Ключевые слова:" label is stripped
              // from every occurrence, not only from the first one.
              docFields["keywords"] = string.Join(", ", keywordMatches.Select(m => CleanKeywordList(m.Value)));
          }
          else
          {
              Console.WriteLine("keywords not found");
          }

          // TODO: annotation extraction is not implemented.

          docFields["text"] = text;
          Console.WriteLine("ReadPDF end");
          return docFields;
      }

      /// <summary>
      /// Placeholder for DOCX extraction; not implemented yet.
      /// </summary>
      /// <returns>Always an empty string.</returns>
      public string ReadDocx()
      {
          return string.Empty;
      }

      /// <summary>
      /// Placeholder for parsing document attributes out of raw text; not implemented yet.
      /// </summary>
      /// <param name="txtdata">Raw document text (currently ignored).</param>
      /// <returns>Always a new, empty dictionary.</returns>
      private Dictionary<string, string> ParseTextData(string txtdata)
      {
          return new Dictionary<string, string>();
      }

      /// <summary>
      /// Turns one raw keyword match ("Ключевые слова: a, b-\nc.") into a single-line list ("a, bc.").
      /// </summary>
      /// <param name="rawMatch">Text matched by <see cref="KeywordsRegex"/>; always contains ':'.</param>
      /// <returns>The keyword list without the leading label, on one line.</returns>
      private static string CleanKeywordList(string rawMatch)
      {
          // Re-join wrapped words BEFORE collapsing whitespace; doing it the other way round
          // leaves a space inside the word, and hyphens inside compound words must be kept.
          string rejoined = WrappedHyphenRegex.Replace(rawMatch, "");
          string singleLine = WhitespaceRunRegex.Replace(rejoined, " ");

          // Drop everything up to and including the label's colon.
          return singleLine.Substring(singleLine.IndexOf(':') + 1).TrimStart();
      }
  }
  85. }