123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- using System;
- using System.IO;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- using BitMiracle.Docotic.Pdf;
- using Console = HyperCube.Utils.AdvConsole;
- namespace HyperCube
- {
/// <summary>
/// Extracts article metadata (name, authors, keywords) and the raw text from uploaded documents.
/// </summary>
public class DocParse
{
    // The patterns never change, so the Regex objects are built once instead of on every call.

    // First sentence of the text: everything from the start up to the first '.', '!' or '?'
    // that is followed by whitespace or the end of the text. Without RegexOptions.Multiline
    // '^' only matches at position 0, so this can match at most once.
    private static readonly Regex NameRegex = new(@"^.*?[\.!\?](?:\s|$)");

    // Cyrillic "I. O. Surname" style author entries.
    private static readonly Regex AuthorsRegex = new(@"[А-Я]\.\s?[А-Я]\.\s[А-Я][а-я]{1,20}");

    // "ключевые слова: ..." (keywords) label followed by a list that ends with a period.
    private static readonly Regex KeywordsRegex =
        new(@"(ключевые)\s*(слова:)\s[\w+\-+\w\,\s]*\.", RegexOptions.IgnoreCase);

    /// <summary>
    /// Reads a PDF from <paramref name="ms"/> and fills an article model with the extracted
    /// text plus the name, authors and keywords found in it by regular expressions.
    /// </summary>
    /// <param name="ms">Stream holding the PDF; it is rewound to position 0 before reading.</param>
    /// <returns>
    /// A new model whose <c>Text</c> is always set; <c>Name</c>, <c>Authors</c> and
    /// <c>Keywords</c> are set only when the corresponding pattern matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="ms"/> is null.</exception>
    public static Models.ArticleModel ReadPDF(MemoryStream ms)
    {
        if (ms is null)
        {
            throw new ArgumentNullException(nameof(ms));
        }

        Models.ArticleModel articleModel = new();

        ms.Position = 0;

        string text;
        // PdfDocument is disposable; release it as soon as the text has been extracted.
        // NOTE(review): assumes disposing the document leaves the caller's stream usable - confirm.
        using (PdfDocument pdf = new(ms))
        {
            text = pdf.GetText();
        }

        // Article name
        Match nameMatch = NameRegex.Match(text);
        if (nameMatch.Success)
        {
            articleModel.Name = nameMatch.Value.Trim();
        }
        else
        {
            Console.WriteLine("cant get name");
        }

        // Publish date: not extracted yet.

        // Authors
        MatchCollection authorMatches = AuthorsRegex.Matches(text);
        if (authorMatches.Count > 0)
        {
            articleModel.Authors = string.Join(", ", authorMatches.Cast<Match>().Select(m => m.Value));
        }
        else
        {
            Console.WriteLine("authors not found");
        }

        // Keywords
        MatchCollection keywordMatches = KeywordsRegex.Matches(text);
        if (keywordMatches.Count > 0)
        {
            // Clean every match on its own so the label is removed from each of them,
            // not only from the first one in the joined string.
            articleModel.Keywords = string.Join(
                ", ",
                keywordMatches.Cast<Match>().Select(m => CleanKeywords(m.Value)));
        }
        else
        {
            Console.WriteLine("keywords not found");
        }

        // Annotation: not extracted yet.

        articleModel.Text = text;
        return articleModel;
    }

    /// <summary>
    /// Placeholder for DOCX parsing; currently builds nothing and returns an empty string.
    /// </summary>
    /// <returns>An empty string.</returns>
    public static string ReadDocx()
    {
        StringBuilder pageText = new();
        return pageText.ToString();
    }

    /// <summary>
    /// Normalizes one keywords match: collapses whitespace runs to single spaces, removes
    /// hyphens and drops the leading label up to and including the ": " separator.
    /// </summary>
    /// <param name="raw">A value matched by <see cref="KeywordsRegex"/>.</param>
    /// <returns>The keyword list without the label (the trailing period is kept).</returns>
    private static string CleanKeywords(string raw)
    {
        string cleaned = Regex.Replace(raw, @"\s+", " ");
        cleaned = cleaned.Replace("-", "");

        // The pattern guarantees ':' followed by whitespace (now a single space) and at least
        // the closing '.', so skipping two characters past ':' stays inside the string.
        return cleaned.Substring(cleaned.IndexOf(':') + 2);
    }
}
- }
|