using BitMiracle.Docotic.Pdf;
using Microsoft.AspNetCore.Components.Forms;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace HyperCube
{
    /// <summary>
    /// Extracts bibliographic fields (name, authors, keywords, raw text) from uploaded documents.
    /// </summary>
    public class DocParse
    {
        private static readonly long MAX_FILE_SIZE = 5120000; // bytes

        // Regexes are built once and reused; Regex instances are safe to share between calls.

        // First sentence of the text: everything from the start up to the first '.', '!' or '?'
        // that is followed by whitespace or the end of the input.
        private static readonly Regex NameRegex = new(@"^.*?[\.!\?](?:\s|$)");

        // Cyrillic initials followed by a surname, e.g. "А. Б. Иванов" / "А.Б. Иванов".
        private static readonly Regex AuthorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

        // "Ключевые слова: ... ." — a keyword list terminated by a period.
        private static readonly Regex KeywordsRegex =
            new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

        private static readonly Regex WhitespaceRunRegex = new(@"\s+");

        /// <summary>
        /// Reads an uploaded PDF and extracts document fields from its text.
        /// </summary>
        /// <param name="file">Uploaded file; at most <c>MAX_FILE_SIZE</c> bytes are accepted.</param>
        /// <returns>
        /// A dictionary with the keys "name", "date", "authors", "keywords", "annotation" and "text".
        /// "date" and "annotation" are not extracted yet and stay empty; fields that cannot be
        /// found in the text also stay empty.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public static async Task<Dictionary<string, string>> ReadPDF(IBrowserFile file)
        {
            if (file is null)
            {
                throw new ArgumentNullException(nameof(file));
            }

            Console.WriteLine("ReadPDF start");

            Dictionary<string, string> docFields = new()
            {
                { "name", "" },
                { "date", "" },
                { "authors", "" },
                { "keywords", "" },
                { "annotation", "" },
                { "text", "" }
            };

            string text;

            // The upload is buffered into memory asynchronously and the PDF is parsed from that
            // seekable copy. Every disposable is released even when parsing throws.
            await using (Stream stream = file.OpenReadStream(MAX_FILE_SIZE))
            using (MemoryStream ms = new())
            {
                await stream.CopyToAsync(ms);
                ms.Position = 0;
                Console.WriteLine($"stream length: {ms.Length}, canread: {ms.CanRead}, canseek:{ms.CanSeek}");

                using PdfDocument pdf = new(ms);
                text = pdf.GetText(); // GetTextWithFormatting()
            }

            docFields["name"] = ExtractName(text);
            // TODO: publish date is not extracted yet.
            docFields["authors"] = ExtractAuthors(text);
            docFields["keywords"] = ExtractKeywords(text);
            // TODO: annotation is not extracted yet.

            // Temporary: expose the full extracted text to the caller.
            docFields["text"] = text;

            Console.WriteLine("ReadPDF end");
            return docFields;
        }

        /// <summary>
        /// Placeholder for DOCX reading; currently always returns an empty string.
        /// </summary>
        /// <returns>An empty string.</returns>
        public string ReadDocx()
        {
            StringBuilder pageText = new();
            return pageText.ToString();
        }

        // Returns the article name (the first sentence of the text, trimmed), or "" when absent.
        private static string ExtractName(string text)
        {
            if (TryJoinMatches(NameRegex, text, out string name))
            {
                return name.Trim();
            }

            Console.WriteLine("name not found");
            return "";
        }

        // Returns all author names found in the text, comma-separated, or "" when none are found.
        private static string ExtractAuthors(string text)
        {
            if (TryJoinMatches(AuthorsRegex, text, out string authors))
            {
                return authors;
            }

            Console.WriteLine("authors not found");
            return "";
        }

        // Returns the keyword list that follows the "Ключевые слова:" label, or "" when absent.
        private static string ExtractKeywords(string text)
        {
            if (!TryJoinMatches(KeywordsRegex, text, out string keywords))
            {
                Console.WriteLine("keywords not found");
                return "";
            }

            // Collapse line breaks and repeated spaces into single spaces.
            keywords = WhitespaceRunRegex.Replace(keywords, " ");
            // Drops every '-' (presumably line-wrap hyphenation; note that this also removes
            // genuine hyphens inside keywords).
            keywords = keywords.Replace("-", "");
            // Skip the label: the pattern guarantees a ':' followed by one (collapsed) space.
            return keywords.Substring(keywords.IndexOf(':') + 2);
        }

        // Joins the values of all matches of regex in text with ", ".
        // Returns false (and an empty string) when there is no match.
        private static bool TryJoinMatches(Regex regex, string text, out string joined)
        {
            MatchCollection matches = regex.Matches(text);
            if (matches.Count == 0)
            {
                joined = "";
                return false;
            }

            joined = string.Join(", ", matches.Cast<Match>().Select(match => match.Value));
            return true;
        }

        // Placeholder: currently ignores txtdata and returns an empty dictionary.
        // NOTE(review): the generic arguments were lost in the reviewed copy of this file;
        // <string, string> is assumed to match ReadPDF's result — confirm.
        private Dictionary<string, string> ParseTextData(string txtdata)
        {
            Dictionary<string, string> docAtributes = new();
            return docAtributes;
        }
    }
}